ocr_validator_utils.py 40 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138
  1. """
  2. OCR验证工具的工具函数模块
  3. 包含数据处理、图像处理、统计分析等功能
  4. """
  5. import json
  6. import pandas as pd
  7. import numpy as np
  8. from pathlib import Path
  9. from PIL import Image, ImageDraw
  10. from typing import Dict, List, Optional, Tuple, Union
  11. from io import StringIO, BytesIO
  12. import re
  13. from html import unescape
  14. import yaml
  15. import base64
  16. from urllib.parse import urlparse
  17. import cv2
  18. import os
  19. def load_config(config_path: str = "config.yaml") -> Dict:
  20. """加载配置文件"""
  21. try:
  22. with open(config_path, 'r', encoding='utf-8') as f:
  23. return yaml.safe_load(f)
  24. except Exception as e:
  25. # 返回默认配置
  26. return get_default_config()
  27. def get_default_config() -> Dict:
  28. """获取默认配置 - 与config.yaml保持一致"""
  29. return {
  30. 'styles': {
  31. 'font_size': 8, # 修改:从字典改为单个数值
  32. 'colors': {
  33. 'primary': '#0288d1',
  34. 'secondary': '#ff9800',
  35. 'success': '#4caf50',
  36. 'error': '#f44336',
  37. 'warning': '#ff9800',
  38. 'background': '#fafafa',
  39. 'text': '#333333'
  40. },
  41. 'layout': {
  42. 'default_zoom': 1.0,
  43. 'default_height': 800, # 修改:从600改为800
  44. 'sidebar_width': 1, # 修改:从0.3改为1
  45. 'content_width': 0.7
  46. }
  47. },
  48. 'ui': {
  49. 'page_title': 'OCR可视化校验工具',
  50. 'page_icon': '🔍',
  51. 'layout': 'wide',
  52. 'sidebar_state': 'expanded'
  53. # 移除:default_font_size和default_layout
  54. },
  55. 'paths': {
  56. # 修改:使用config.yaml中的实际路径
  57. 'ocr_out_dir': '/Users/zhch158/workspace/data/至远彩色印刷工业有限公司/data_DotsOCR_Results',
  58. 'src_img_dir': '/Users/zhch158/workspace/data/至远彩色印刷工业有限公司/data_PPStructureV3_Results/2023年度报告母公司',
  59. 'supported_image_formats': ['.png', '.jpg', '.jpeg']
  60. },
  61. 'ocr': {
  62. 'min_text_length': 2,
  63. 'default_confidence': 1.0,
  64. 'exclude_texts': ['Picture', ''],
  65. # 新增:图片方向检测配置
  66. 'orientation_detection': {
  67. 'enabled': True,
  68. 'confidence_threshold': 0.3,
  69. 'methods': ['opencv_analysis'],
  70. 'cache_results': True
  71. },
  72. 'tools': {
  73. 'dots_ocr': {
  74. 'name': 'Dots OCR',
  75. 'description': '专业VLM OCR', # 新增描述
  76. 'json_structure': 'array',
  77. 'text_field': 'text',
  78. 'bbox_field': 'bbox',
  79. 'category_field': 'category',
  80. 'confidence_field': 'confidence', # 新增置信度字段
  81. # 新增:旋转处理配置
  82. 'rotation': {
  83. 'coordinates_are_pre_rotated': False
  84. }
  85. },
  86. 'ppstructv3': {
  87. 'name': 'PPStructV3',
  88. 'description': 'PaddleOCR PP-StructureV3', # 新增描述
  89. 'json_structure': 'object',
  90. 'parsing_results_field': 'parsing_res_list',
  91. 'text_field': 'block_content',
  92. 'bbox_field': 'block_bbox',
  93. 'category_field': 'block_label',
  94. 'confidence_field': 'confidence', # 新增置信度字段
  95. # 新增:旋转处理配置
  96. 'rotation': {
  97. 'coordinates_are_pre_rotated': True
  98. }
  99. }
  100. },
  101. 'auto_detection': {
  102. 'enabled': True,
  103. 'rules': [
  104. {
  105. 'field_exists': 'parsing_res_list',
  106. 'tool_type': 'ppstructv3'
  107. },
  108. {
  109. 'json_is_array': True,
  110. 'tool_type': 'dots_ocr'
  111. }
  112. ]
  113. }
  114. }
  115. }
  116. def load_css_styles(css_path: str = "styles.css") -> str:
  117. """加载CSS样式文件"""
  118. try:
  119. with open(css_path, 'r', encoding='utf-8') as f:
  120. return f.read()
  121. except Exception:
  122. # 返回基本样式
  123. return """
  124. .main > div { background-color: white !important; color: #333333 !important; }
  125. .stApp { background-color: white !important; }
  126. .block-container { background-color: white !important; color: #333333 !important; }
  127. """
  128. def rotate_image_and_coordinates(
  129. image: Image.Image,
  130. angle: float,
  131. coordinates_list: List[List[int]],
  132. rotate_coordinates: bool = True
  133. ) -> Tuple[Image.Image, List[List[int]]]:
  134. """
  135. 根据角度旋转图像和坐标 - 修正版本
  136. Args:
  137. image: 原始图像
  138. angle: 旋转角度(度数)
  139. coordinates_list: 坐标列表,每个坐标为[x1, y1, x2, y2]格式
  140. rotate_coordinates: 是否需要旋转坐标(针对不同OCR工具的处理方式)
  141. Returns:
  142. rotated_image: 旋转后的图像
  143. rotated_coordinates: 处理后的坐标列表
  144. """
  145. if angle == 0:
  146. return image, coordinates_list
  147. # 标准化旋转角度
  148. if angle == 270:
  149. rotation_angle = -90 # 顺时针90度
  150. elif angle == 90:
  151. rotation_angle = 90 # 逆时针90度
  152. elif angle == 180:
  153. rotation_angle = 180 # 180度
  154. else:
  155. rotation_angle = angle
  156. # 旋转图像
  157. rotated_image = image.rotate(rotation_angle, expand=True)
  158. # 如果不需要旋转坐标,直接返回原坐标
  159. if not rotate_coordinates:
  160. return rotated_image, coordinates_list
  161. # 获取原始和旋转后的图像尺寸
  162. orig_width, orig_height = image.size
  163. new_width, new_height = rotated_image.size
  164. # 计算旋转后的坐标
  165. rotated_coordinates = []
  166. for coord in coordinates_list:
  167. if len(coord) < 4:
  168. rotated_coordinates.append(coord)
  169. continue
  170. x1, y1, x2, y2 = coord[:4]
  171. # 验证原始坐标是否有效
  172. if x1 < 0 or y1 < 0 or x2 <= x1 or y2 <= y1:
  173. print(f"警告: 无效坐标 {coord}")
  174. rotated_coordinates.append([0, 0, 50, 50]) # 使用默认坐标
  175. continue
  176. # 根据旋转角度变换坐标
  177. if rotation_angle == -90: # 顺时针90度 (270度逆时针)
  178. # 变换公式: (x, y) -> (orig_height - y, x)
  179. new_x1 = orig_height - y2 # 这里是y2
  180. new_y1 = x1
  181. new_x2 = orig_height - y1 # 这里是y1
  182. new_y2 = x2
  183. elif rotation_angle == 90: # 逆时针90度
  184. # 变换公式: (x, y) -> (y, orig_width - x)
  185. new_x1 = y1
  186. new_y1 = orig_width - x2 # 这里是x2
  187. new_x2 = y2
  188. new_y2 = orig_width - x1 # 这里是x1
  189. elif rotation_angle == 180: # 180度
  190. # 变换公式: (x, y) -> (orig_width - x, orig_height - y)
  191. new_x1 = orig_width - x2
  192. new_y1 = orig_height - y2
  193. new_x2 = orig_width - x1
  194. new_y2 = orig_height - y1
  195. else: # 任意角度算法 - 修正版本
  196. # 将角度转换为弧度
  197. angle_rad = np.radians(rotation_angle)
  198. cos_angle = np.cos(angle_rad)
  199. sin_angle = np.sin(angle_rad)
  200. # 原图像中心点
  201. orig_center_x = orig_width / 2
  202. orig_center_y = orig_height / 2
  203. # 旋转后图像中心点
  204. new_center_x = new_width / 2
  205. new_center_y = new_height / 2
  206. # 将bbox的四个角点转换为相对于原图像中心的坐标
  207. corners = [
  208. (x1 - orig_center_x, y1 - orig_center_y), # 左上角
  209. (x2 - orig_center_x, y1 - orig_center_y), # 右上角
  210. (x2 - orig_center_x, y2 - orig_center_y), # 右下角
  211. (x1 - orig_center_x, y2 - orig_center_y) # 左下角
  212. ]
  213. # 应用修正后的旋转矩阵变换每个角点
  214. rotated_corners = []
  215. for x, y in corners:
  216. # 修正后的旋转矩阵: [cos(θ) sin(θ)] [x]
  217. # [-sin(θ) cos(θ)] [y]
  218. rotated_x = x * cos_angle + y * sin_angle
  219. rotated_y = -x * sin_angle + y * cos_angle
  220. # 转换回绝对坐标(相对于新图像)
  221. abs_x = rotated_x + new_center_x
  222. abs_y = rotated_y + new_center_y
  223. rotated_corners.append((abs_x, abs_y))
  224. # 从旋转后的四个角点计算新的边界框
  225. x_coords = [corner[0] for corner in rotated_corners]
  226. y_coords = [corner[1] for corner in rotated_corners]
  227. new_x1 = int(min(x_coords))
  228. new_y1 = int(min(y_coords))
  229. new_x2 = int(max(x_coords))
  230. new_y2 = int(max(y_coords))
  231. # 确保坐标在有效范围内
  232. new_x1 = max(0, min(new_width, new_x1))
  233. new_y1 = max(0, min(new_height, new_y1))
  234. new_x2 = max(0, min(new_width, new_x2))
  235. new_y2 = max(0, min(new_height, new_y2))
  236. # 确保x1 < x2, y1 < y2
  237. if new_x1 > new_x2:
  238. new_x1, new_x2 = new_x2, new_x1
  239. if new_y1 > new_y2:
  240. new_y1, new_y2 = new_y2, new_y1
  241. rotated_coordinates.append([new_x1, new_y1, new_x2, new_y2])
  242. return rotated_image, rotated_coordinates
  243. def detect_ocr_tool_type(data: Union[List, Dict], config: Dict) -> str:
  244. """自动检测OCR工具类型"""
  245. if not config['ocr']['auto_detection']['enabled']:
  246. return 'dots_ocr' # 默认类型
  247. rules = config['ocr']['auto_detection']['rules']
  248. for rule in rules:
  249. if 'field_exists' in rule:
  250. field_name = rule['field_exists']
  251. if isinstance(data, dict) and field_name in data:
  252. return rule['tool_type']
  253. if 'json_is_array' in rule:
  254. if rule['json_is_array'] and isinstance(data, list):
  255. return rule['tool_type']
  256. # 默认返回dots_ocr
  257. return 'dots_ocr'
  258. def parse_dots_ocr_data(data: List, config: Dict) -> List[Dict]:
  259. """解析Dots OCR格式的数据"""
  260. tool_config = config['ocr']['tools']['dots_ocr']
  261. parsed_data = []
  262. for item in data:
  263. if not isinstance(item, dict):
  264. continue
  265. # 提取字段
  266. text = item.get(tool_config['text_field'], '')
  267. bbox = item.get(tool_config['bbox_field'], [])
  268. category = item.get(tool_config['category_field'], 'Text')
  269. confidence = item.get(tool_config.get('confidence_field', 'confidence'),
  270. config['ocr']['default_confidence'])
  271. if text and bbox and len(bbox) >= 4:
  272. parsed_data.append({
  273. 'text': str(text).strip(),
  274. 'bbox': bbox[:4], # 确保只取前4个坐标
  275. 'category': category,
  276. 'confidence': confidence,
  277. 'source_tool': 'dots_ocr'
  278. })
  279. return parsed_data
  280. def parse_ppstructv3_data(data: Dict, config: Dict) -> List[Dict]:
  281. """解析PPStructV3格式的数据"""
  282. tool_config = config['ocr']['tools']['ppstructv3']
  283. parsed_data = []
  284. parsing_results = data.get(tool_config['parsing_results_field'], [])
  285. if not isinstance(parsing_results, list):
  286. return parsed_data
  287. for item in parsing_results:
  288. if not isinstance(item, dict):
  289. continue
  290. text = item.get(tool_config['text_field'], '')
  291. bbox = item.get(tool_config['bbox_field'], [])
  292. category = item.get(tool_config['category_field'], 'text')
  293. confidence = item.get(
  294. tool_config.get('confidence_field', 'confidence'),
  295. config['ocr']['default_confidence']
  296. )
  297. if text and bbox and len(bbox) >= 4:
  298. parsed_data.append({
  299. 'text': str(text).strip(),
  300. 'bbox': bbox[:4],
  301. 'category': category,
  302. 'confidence': confidence,
  303. 'source_tool': 'ppstructv3'
  304. })
  305. rec_texts = get_nested_value(data, tool_config.get('rec_texts_field', ''))
  306. rec_boxes = get_nested_value(data, tool_config.get('rec_boxes_field', ''))
  307. if isinstance(rec_texts, list) and isinstance(rec_boxes, list):
  308. for i, (text, box) in enumerate(zip(rec_texts, rec_boxes)):
  309. if text and isinstance(box, list) and len(box) >= 4:
  310. parsed_data.append({
  311. 'text': str(text).strip(),
  312. 'bbox': box[:4],
  313. 'category': 'OCR_Text',
  314. 'source_tool': 'ppstructv3_ocr'
  315. })
  316. return parsed_data
  317. def parse_table_recognition_v2_data(data: Dict, config: Dict) -> List[Dict]:
  318. tool_config = config['ocr']['tools']['table_recognition_v2']
  319. parsed_data = []
  320. tables = data.get(tool_config['parsing_results_field'], [])
  321. if not isinstance(tables, list):
  322. return parsed_data
  323. for item in tables:
  324. if not isinstance(item, dict):
  325. continue
  326. html_text = item.get(tool_config['text_field'], '')
  327. bbox = item.get(tool_config['bbox_field'], [])
  328. if bbox and len(bbox) >= 4:
  329. bbox = bbox[:4]
  330. else:
  331. bbox = [0, 0, 0, 0]
  332. parsed_data.append({
  333. 'text': str(html_text).strip(),
  334. 'bbox': bbox,
  335. 'category': item.get(tool_config.get('category_field', ''), 'table'),
  336. 'confidence': item.get(tool_config.get('confidence_field', ''), config['ocr']['default_confidence']),
  337. 'source_tool': 'table_recognition_v2',
  338. })
  339. rec_texts = get_nested_value(data, tool_config.get('rec_texts_field', ''))
  340. rec_boxes = get_nested_value(data, tool_config.get('rec_boxes_field', ''))
  341. if isinstance(rec_texts, list) and isinstance(rec_boxes, list):
  342. for i, (text, box) in enumerate(zip(rec_texts, rec_boxes)):
  343. if text and isinstance(box, list) and len(box) >= 4:
  344. parsed_data.append({
  345. 'text': str(text).strip(),
  346. 'bbox': box[:4],
  347. 'category': 'OCR_Text',
  348. 'source_tool': 'ppstructv3_ocr'
  349. })
  350. return parsed_data
  351. def normalize_ocr_data(raw_data: Union[List, Dict], config: Dict) -> List[Dict]:
  352. tool_type = detect_ocr_tool_type(raw_data, config)
  353. if tool_type == 'dots_ocr':
  354. return parse_dots_ocr_data(raw_data, config)
  355. if tool_type == 'ppstructv3':
  356. return parse_ppstructv3_data(raw_data, config)
  357. if tool_type == 'table_recognition_v2':
  358. return parse_table_recognition_v2_data(raw_data, config)
  359. raise ValueError(f"不支持的OCR工具类型: {tool_type}")
  360. def get_rotation_angle_from_ppstructv3(data: Dict) -> float:
  361. """从PPStructV3数据中获取旋转角度"""
  362. if 'doc_preprocessor_res' in data:
  363. doc_res = data['doc_preprocessor_res']
  364. if isinstance(doc_res, dict) and 'angle' in doc_res:
  365. return float(doc_res['angle'])
  366. return 0.0
  367. def find_image_in_multiple_locations(img_src: str, json_path: str) -> Optional[str]:
  368. """
  369. 在多个可能的位置查找图片文件
  370. """
  371. json_dir = os.path.dirname(json_path)
  372. # 可能的搜索路径
  373. search_paths = [
  374. # 相对于JSON文件的路径
  375. os.path.join(json_dir, img_src),
  376. # 相对于JSON文件父目录的路径
  377. os.path.join(os.path.dirname(json_dir), img_src),
  378. # imgs目录(常见的图片目录)
  379. os.path.join(json_dir, 'imgs', os.path.basename(img_src)),
  380. os.path.join(os.path.dirname(json_dir), 'imgs', os.path.basename(img_src)),
  381. # images目录
  382. os.path.join(json_dir, 'images', os.path.basename(img_src)),
  383. os.path.join(os.path.dirname(json_dir), 'images', os.path.basename(img_src)),
  384. # 同名目录
  385. os.path.join(json_dir, os.path.splitext(os.path.basename(json_path))[0], os.path.basename(img_src)),
  386. ]
  387. # 如果是绝对路径,也加入搜索
  388. if os.path.isabs(img_src):
  389. search_paths.insert(0, img_src)
  390. # 查找存在的文件
  391. for path in search_paths:
  392. if os.path.exists(path):
  393. return path
  394. return None
  395. def process_html_images(html_content: str, json_path: str) -> str:
  396. """
  397. 处理HTML内容中的图片引用,将本地图片转换为base64 - 增强版
  398. """
  399. import re
  400. # 匹配HTML图片标签: <img src="path" ... />
  401. img_pattern = r'<img\s+[^>]*src\s*=\s*["\']([^"\']+)["\'][^>]*/?>'
  402. def replace_html_image(match):
  403. full_tag = match.group(0)
  404. img_src = match.group(1)
  405. # 如果已经是base64或者网络链接,直接返回
  406. if img_src.startswith('data:image') or img_src.startswith('http'):
  407. return full_tag
  408. # 增强的图片查找
  409. full_img_path = find_image_in_multiple_locations(img_src, json_path)
  410. # 尝试转换为base64
  411. try:
  412. if full_img_path and os.path.exists(full_img_path):
  413. with open(full_img_path, 'rb') as img_file:
  414. img_data = img_file.read()
  415. # 获取文件扩展名确定MIME类型
  416. ext = os.path.splitext(full_img_path)[1].lower()
  417. mime_type = {
  418. '.png': 'image/png',
  419. '.jpg': 'image/jpeg',
  420. '.jpeg': 'image/jpeg',
  421. '.gif': 'image/gif',
  422. '.bmp': 'image/bmp',
  423. '.webp': 'image/webp'
  424. }.get(ext, 'image/jpeg')
  425. # 转换为base64
  426. img_base64 = base64.b64encode(img_data).decode('utf-8')
  427. data_url = f"data:{mime_type};base64,{img_base64}"
  428. # 替换src属性,保持其他属性不变
  429. updated_tag = re.sub(
  430. r'src\s*=\s*["\'][^"\']+["\']',
  431. f'src="{data_url}"',
  432. full_tag
  433. )
  434. return updated_tag
  435. else:
  436. # 文件不存在,显示详细的错误信息
  437. search_info = f"搜索路径: {img_src}"
  438. if full_img_path:
  439. search_info += f" -> {full_img_path}"
  440. error_content = f"""
  441. <div style="
  442. color: #d32f2f;
  443. border: 2px dashed #d32f2f;
  444. padding: 10px;
  445. margin: 10px 0;
  446. border-radius: 5px;
  447. background-color: #ffebee;
  448. text-align: center;
  449. ">
  450. <strong>🖼️ 图片无法加载</strong><br>
  451. <small>原始路径: {img_src}</small><br>
  452. <small>JSON文件: {os.path.basename(json_path)}</small><br>
  453. <em>请检查图片文件是否存在</em>
  454. </div>
  455. """
  456. return error_content
  457. except Exception as e:
  458. # 转换失败,返回错误信息
  459. error_content = f"""
  460. <div style="
  461. color: #f57c00;
  462. border: 2px dashed #f57c00;
  463. padding: 10px;
  464. margin: 10px 0;
  465. border-radius: 5px;
  466. background-color: #fff3e0;
  467. text-align: center;
  468. ">
  469. <strong>⚠️ 图片处理失败</strong><br>
  470. <small>文件: {img_src}</small><br>
  471. <small>错误: {str(e)}</small>
  472. </div>
  473. """
  474. return error_content
  475. # 替换所有HTML图片标签
  476. processed_content = re.sub(img_pattern, replace_html_image, html_content, flags=re.IGNORECASE)
  477. return processed_content
  478. def process_markdown_images(md_content: str, json_path: str) -> str:
  479. """
  480. 处理Markdown中的图片引用,将本地图片转换为base64
  481. """
  482. import re
  483. # 匹配Markdown图片语法: ![alt](path)
  484. img_pattern = r'!\[([^\]]*)\]\(([^)]+)\)'
  485. def replace_image(match):
  486. alt_text = match.group(1)
  487. img_path = match.group(2)
  488. # 如果已经是base64或者网络链接,直接返回
  489. if img_path.startswith('data:image') or img_path.startswith('http'):
  490. return match.group(0)
  491. # 处理相对路径
  492. if not os.path.isabs(img_path):
  493. # 相对于JSON文件的路径
  494. json_dir = os.path.dirname(json_path)
  495. full_img_path = os.path.join(json_dir, img_path)
  496. else:
  497. full_img_path = img_path
  498. # 尝试转换为base64
  499. try:
  500. if os.path.exists(full_img_path):
  501. with open(full_img_path, 'rb') as img_file:
  502. img_data = img_file.read()
  503. # 获取文件扩展名确定MIME类型
  504. ext = os.path.splitext(full_img_path)[1].lower()
  505. mime_type = {
  506. '.png': 'image/png',
  507. '.jpg': 'image/jpeg',
  508. '.jpeg': 'image/jpeg',
  509. '.gif': 'image/gif',
  510. '.bmp': 'image/bmp',
  511. '.webp': 'image/webp'
  512. }.get(ext, 'image/jpeg')
  513. # 转换为base64
  514. img_base64 = base64.b64encode(img_data).decode('utf-8')
  515. data_url = f"data:{mime_type};base64,{img_base64}"
  516. return f'![{alt_text}]({data_url})'
  517. else:
  518. # 文件不存在,返回原始链接但添加警告
  519. return f'![{alt_text} (文件不存在)]({img_path})'
  520. except Exception as e:
  521. # 转换失败,返回原始链接
  522. return f'![{alt_text} (加载失败)]({img_path})'
  523. # 替换所有图片引用
  524. processed_content = re.sub(img_pattern, replace_image, md_content)
  525. return processed_content
  526. def process_all_images_in_content(content: str, json_path: str) -> str:
  527. """
  528. 处理内容中的所有图片引用(包括Markdown和HTML格式)
  529. """
  530. # 先处理HTML图片
  531. content = process_html_images(content, json_path)
  532. # 再处理Markdown图片
  533. content = process_markdown_images(content, json_path)
  534. return content
  535. # 修改 load_ocr_data_file 函数
  536. def load_ocr_data_file(json_path: str, config: Dict) -> Tuple[List, str, str]:
  537. """加载OCR数据文件 - 支持多数据源配置"""
  538. json_file = Path(json_path)
  539. if not json_file.exists():
  540. raise FileNotFoundError(f"找不到JSON文件: {json_path}")
  541. # 加载JSON数据
  542. try:
  543. with open(json_file, 'r', encoding='utf-8') as f:
  544. raw_data = json.load(f)
  545. # 统一数据格式
  546. ocr_data = normalize_ocr_data(raw_data, config)
  547. # 检查是否需要处理图像旋转
  548. rotation_angle = 0.0
  549. if isinstance(raw_data, dict):
  550. rotation_angle = get_rotation_angle_from_ppstructv3(raw_data)
  551. # 如果有旋转角度,记录下来供后续图像处理使用
  552. if rotation_angle != 0:
  553. for item in ocr_data:
  554. item['rotation_angle'] = rotation_angle
  555. except Exception as e:
  556. raise Exception(f"加载JSON文件失败: {e}")
  557. # 加载MD文件
  558. md_file = json_file.with_suffix('.md')
  559. md_content = ""
  560. if md_file.exists():
  561. with open(md_file, 'r', encoding='utf-8') as f:
  562. md_content = f.read()
  563. # 查找对应的图片文件
  564. image_path = find_corresponding_image(json_file, config)
  565. return ocr_data, md_content, image_path
  566. def find_corresponding_image(json_file: Path, config: Dict) -> str:
  567. """查找对应的图片文件 - 支持多数据源"""
  568. # 从配置中获取图片目录
  569. src_img_dir = config.get('paths', {}).get('src_img_dir', '')
  570. if not src_img_dir:
  571. # 如果没有配置图片目录,尝试在JSON文件同级目录查找
  572. src_img_dir = json_file.parent
  573. src_img_path = Path(src_img_dir)
  574. # 支持多种图片格式
  575. image_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif']
  576. for ext in image_extensions:
  577. image_file = src_img_path / f"{json_file.stem}{ext}"
  578. if image_file.exists():
  579. return str(image_file)
  580. # 如果找不到,返回空字符串
  581. return ""
  582. def process_ocr_data(ocr_data: List, config: Dict) -> Dict[str, List]:
  583. """处理OCR数据,建立文本到bbox的映射"""
  584. text_bbox_mapping = {}
  585. exclude_texts = config['ocr']['exclude_texts']
  586. min_text_length = config['ocr']['min_text_length']
  587. if not isinstance(ocr_data, list):
  588. return text_bbox_mapping
  589. for i, item in enumerate(ocr_data):
  590. if not isinstance(item, dict):
  591. continue
  592. text = str(item['text']).strip()
  593. if text and text not in exclude_texts and len(text) >= min_text_length:
  594. bbox = item['bbox']
  595. if isinstance(bbox, list) and len(bbox) == 4:
  596. if text not in text_bbox_mapping:
  597. text_bbox_mapping[text] = []
  598. text_bbox_mapping[text].append({
  599. 'bbox': bbox,
  600. 'category': item.get('category', 'Text'),
  601. 'index': i,
  602. 'confidence': item.get('confidence', config['ocr']['default_confidence']),
  603. 'source_tool': item.get('source_tool', 'unknown'),
  604. 'rotation_angle': item.get('rotation_angle', 0.0) # 添加旋转角度信息
  605. })
  606. return text_bbox_mapping
  607. def find_available_ocr_files(ocr_out_dir: str) -> List[str]:
  608. """查找可用的OCR文件"""
  609. available_files = []
  610. # 搜索多个可能的目录
  611. search_dirs = [
  612. Path(ocr_out_dir),
  613. ]
  614. for search_dir in search_dirs:
  615. if search_dir.exists():
  616. # 递归搜索JSON文件
  617. for json_file in search_dir.rglob("*.json"):
  618. if re.match(r'.*_page_\d+\.json$', json_file.name, re.IGNORECASE):
  619. available_files.append(str(json_file))
  620. # 去重并排序
  621. # available_files = sorted(list(set(available_files)))
  622. # 解析文件名并提取页码信息
  623. file_info = []
  624. for file_path in available_files:
  625. file_name = Path(file_path).stem
  626. # 提取页码 (例如从 "2023年度报告母公司_page_001" 中提取 "001")
  627. if 'page_' in file_name:
  628. try:
  629. page_part = file_name.split('page_')[-1]
  630. page_num = int(page_part)
  631. file_info.append({
  632. 'path': file_path,
  633. 'page': page_num,
  634. 'display_name': f"第{page_num}页"
  635. })
  636. except ValueError:
  637. # 如果无法解析页码,使用文件名
  638. file_info.append({
  639. 'path': file_path,
  640. 'page': len(file_info) + 1,
  641. 'display_name': Path(file_path).stem
  642. })
  643. else:
  644. # 对于没有page_的文件,按顺序编号
  645. file_info.append({
  646. 'path': file_path,
  647. 'page': len(file_info) + 1,
  648. 'display_name': Path(file_path).stem
  649. })
  650. # 按页码排序
  651. file_info.sort(key=lambda x: x['page'])
  652. return file_info
  653. def get_ocr_tool_info(ocr_data: List) -> Dict:
  654. """获取OCR工具信息统计"""
  655. tool_counts = {}
  656. for item in ocr_data:
  657. if isinstance(item, dict):
  658. source_tool = item.get('source_tool', 'unknown')
  659. tool_counts[source_tool] = tool_counts.get(source_tool, 0) + 1
  660. return tool_counts
  661. def draw_bbox_on_image(image: Image.Image, bbox: List[int], color: str = "red", width: int = 3) -> Image.Image:
  662. """在图片上绘制bbox框"""
  663. img_copy = image.copy()
  664. draw = ImageDraw.Draw(img_copy)
  665. x1, y1, x2, y2 = bbox
  666. # 绘制矩形框
  667. draw.rectangle([x1, y1, x2, y2], outline=color, width=width)
  668. # 添加半透明填充
  669. overlay = Image.new('RGBA', img_copy.size, (0, 0, 0, 0))
  670. overlay_draw = ImageDraw.Draw(overlay)
  671. color_map = {
  672. "red": (255, 0, 0, 30),
  673. "blue": (0, 0, 255, 30),
  674. "green": (0, 255, 0, 30)
  675. }
  676. fill_color = color_map.get(color, (255, 255, 0, 30))
  677. overlay_draw.rectangle([x1, y1, x2, y2], fill=fill_color)
  678. img_copy = Image.alpha_composite(img_copy.convert('RGBA'), overlay).convert('RGB')
  679. return img_copy
  680. def get_ocr_statistics(ocr_data: List, text_bbox_mapping: Dict, marked_errors: set) -> Dict:
  681. """获取OCR数据统计信息"""
  682. if not isinstance(ocr_data, list) or not ocr_data:
  683. return {
  684. 'total_texts': 0, 'clickable_texts': 0, 'marked_errors': 0,
  685. 'categories': {}, 'accuracy_rate': 0, 'tool_info': {}
  686. }
  687. total_texts = len(ocr_data)
  688. clickable_texts = len(text_bbox_mapping)
  689. marked_errors_count = len(marked_errors)
  690. # 按类别统计
  691. categories = {}
  692. for item in ocr_data:
  693. if isinstance(item, dict):
  694. category = item.get('category', 'Unknown')
  695. categories[category] = categories.get(category, 0) + 1
  696. # OCR工具信息统计
  697. tool_info = get_ocr_tool_info(ocr_data)
  698. accuracy_rate = (clickable_texts - marked_errors_count) / clickable_texts * 100 if clickable_texts > 0 else 0
  699. return {
  700. 'total_texts': total_texts,
  701. 'clickable_texts': clickable_texts,
  702. 'marked_errors': marked_errors_count,
  703. 'categories': categories,
  704. 'accuracy_rate': accuracy_rate,
  705. 'tool_info': tool_info
  706. }
  707. def convert_html_table_to_markdown(content: str) -> str:
  708. """将HTML表格转换为Markdown表格格式 - 支持横向滚动的增强版本"""
  709. def replace_table(match):
  710. table_html = match.group(0)
  711. # 提取所有行
  712. rows = re.findall(r'<tr[^>]*>(.*?)</tr>', table_html, re.DOTALL | re.IGNORECASE)
  713. if not rows:
  714. return table_html
  715. markdown_rows = []
  716. max_cols = 0
  717. # 处理所有行,找出最大列数
  718. processed_rows = []
  719. for row in rows:
  720. # 提取单元格,支持 th 和 td
  721. cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', row, re.DOTALL | re.IGNORECASE)
  722. if cells:
  723. clean_cells = []
  724. for cell in cells:
  725. cell_text = re.sub(r'<[^>]+>', '', cell).strip()
  726. cell_text = unescape(cell_text)
  727. # 限制单元格长度,避免表格过宽
  728. if len(cell_text) > 30:
  729. cell_text = cell_text[:27] + "..."
  730. clean_cells.append(cell_text or " ") # 空单元格用空格替代
  731. processed_rows.append(clean_cells)
  732. max_cols = max(max_cols, len(clean_cells))
  733. # 统一所有行的列数
  734. for i, row_cells in enumerate(processed_rows):
  735. while len(row_cells) < max_cols:
  736. row_cells.append(" ")
  737. # 构建Markdown行
  738. markdown_row = '| ' + ' | '.join(row_cells) + ' |'
  739. markdown_rows.append(markdown_row)
  740. # 在第一行后添加分隔符
  741. if i == 0:
  742. separator = '| ' + ' | '.join(['---'] * max_cols) + ' |'
  743. markdown_rows.append(separator)
  744. # 添加滚动提示
  745. if max_cols > 8:
  746. scroll_note = "\n> 📋 **提示**: 此表格列数较多,在某些视图中可能需要横向滚动查看完整内容。\n"
  747. return scroll_note + '\n'.join(markdown_rows) if markdown_rows else table_html
  748. return '\n'.join(markdown_rows) if markdown_rows else table_html
  749. # 替换所有HTML表格
  750. converted = re.sub(r'<table[^>]*>.*?</table>', replace_table, content, flags=re.DOTALL | re.IGNORECASE)
  751. return converted
  752. def parse_html_tables(html_content: str) -> List[pd.DataFrame]:
  753. """解析HTML内容中的表格为DataFrame列表"""
  754. try:
  755. tables = pd.read_html(StringIO(html_content))
  756. return tables if tables else []
  757. except Exception:
  758. return []
  759. def create_dynamic_css(config: Dict, font_size_key: str, height: int) -> str:
  760. """根据配置动态创建CSS样式"""
  761. colors = config['styles']['colors']
  762. font_size = config['styles']['font_sizes'][font_size_key]
  763. return f"""
  764. <style>
  765. .dynamic-content {{
  766. height: {height}px;
  767. font-size: {font_size}px !important;
  768. line-height: 1.4;
  769. background-color: {colors['background']} !important;
  770. color: {colors['text']} !important;
  771. border: 1px solid #ddd;
  772. padding: 10px;
  773. border-radius: 5px;
  774. }}
  775. .highlight-selected {{
  776. background-color: {colors['success']} !important;
  777. color: white !important;
  778. }}
  779. .highlight-error {{
  780. background-color: {colors['error']} !important;
  781. color: white !important;
  782. }}
  783. </style>
  784. """
  785. def export_tables_to_excel(tables: List[pd.DataFrame], filename: str = "ocr_tables.xlsx") -> BytesIO:
  786. """导出表格数据到Excel"""
  787. output = BytesIO()
  788. with pd.ExcelWriter(output, engine='openpyxl') as writer:
  789. for i, table in enumerate(tables):
  790. table.to_excel(writer, sheet_name=f'Table_{i+1}', index=False)
  791. return output
  792. def get_table_statistics(tables: List[pd.DataFrame]) -> List[Dict]:
  793. """获取表格统计信息"""
  794. stats = []
  795. for i, table in enumerate(tables):
  796. numeric_cols = len(table.select_dtypes(include=[np.number]).columns)
  797. stats.append({
  798. 'table_index': i + 1,
  799. 'rows': len(table),
  800. 'columns': len(table.columns),
  801. 'numeric_columns': numeric_cols
  802. })
  803. return stats
  804. def group_texts_by_category(text_bbox_mapping: Dict[str, List]) -> Dict[str, List[str]]:
  805. """按类别对文本进行分组"""
  806. categories = {}
  807. for text, info_list in text_bbox_mapping.items():
  808. category = info_list[0]['category']
  809. if category not in categories:
  810. categories[category] = []
  811. categories[category].append(text)
  812. return categories
  813. def get_ocr_tool_rotation_config(ocr_data: List, config: Dict) -> Dict:
  814. """获取OCR工具的旋转配置"""
  815. if not ocr_data or not isinstance(ocr_data, list):
  816. # 默认配置
  817. return {
  818. 'coordinates_are_pre_rotated': False
  819. }
  820. # 从第一个OCR数据项获取工具类型
  821. first_item = ocr_data[0] if ocr_data else {}
  822. source_tool = first_item.get('source_tool', 'dots_ocr')
  823. # 获取工具配置
  824. tools_config = config.get('ocr', {}).get('tools', {})
  825. if source_tool in tools_config:
  826. tool_config = tools_config[source_tool]
  827. return tool_config.get('rotation', {
  828. 'coordinates_are_pre_rotated': False
  829. })
  830. else:
  831. # 默认配置
  832. return {
  833. 'coordinates_are_pre_rotated': False
  834. }
  835. def detect_image_orientation_by_opencv(image_path: str) -> Dict:
  836. """
  837. 使用OpenCV的文本检测来判断图片方向
  838. """
  839. try:
  840. # 读取图像
  841. image = cv2.imread(image_path)
  842. if image is None:
  843. raise ValueError("无法读取图像文件")
  844. height, width = image.shape[:2]
  845. gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
  846. # 使用EAST文本检测器或其他方法
  847. # 这里使用简单的边缘检测和轮廓分析
  848. edges = cv2.Canny(gray, 50, 150, apertureSize=3)
  849. # 检测直线
  850. lines = cv2.HoughLines(edges, 1, np.pi/180, threshold=100)
  851. if lines is None:
  852. return {
  853. 'detected_angle': 0.0,
  854. 'confidence': 0.0,
  855. 'method': 'opencv_analysis',
  856. 'message': '未检测到足够的直线特征'
  857. }
  858. # 分析直线角度
  859. angles = []
  860. for rho, theta in lines[:, 0]:
  861. angle = theta * 180 / np.pi
  862. # 将角度标准化到0-180度
  863. if angle > 90:
  864. angle = angle - 180
  865. angles.append(angle)
  866. # 统计主要角度
  867. angle_hist = np.histogram(angles, bins=36, range=(-90, 90))[0]
  868. dominant_angle_idx = np.argmax(angle_hist)
  869. dominant_angle = -90 + dominant_angle_idx * 5 # 每个bin 5度
  870. # 将角度映射到标准旋转角度
  871. if -22.5 <= dominant_angle <= 22.5:
  872. detected_angle = 0.0
  873. elif 22.5 < dominant_angle <= 67.5:
  874. detected_angle = 270.0
  875. elif 67.5 < dominant_angle <= 90 or -90 <= dominant_angle < -67.5:
  876. detected_angle = 90.0
  877. else:
  878. detected_angle = 180.0
  879. confidence = angle_hist[dominant_angle_idx] / len(lines) if len(lines) > 0 else 0.0
  880. return {
  881. 'detected_angle': detected_angle,
  882. 'confidence': min(1.0, confidence),
  883. 'method': 'opencv_analysis',
  884. 'line_count': len(lines),
  885. 'dominant_angle': dominant_angle,
  886. 'message': f'基于{len(lines)}条直线检测到旋转角度: {detected_angle}°'
  887. }
  888. except Exception as e:
  889. return {
  890. 'detected_angle': 0.0,
  891. 'confidence': 0.0,
  892. 'method': 'opencv_analysis',
  893. 'error': str(e),
  894. 'message': f'OpenCV检测过程中发生错误: {str(e)}'
  895. }
  896. # ocr_validator_utils.py
  897. def find_available_ocr_files_multi_source(config: Dict) -> Dict[str, List[Dict]]:
  898. """查找多个数据源的OCR文件"""
  899. all_sources = {}
  900. for source in config.get('data_sources', []):
  901. source_name = source['name']
  902. ocr_tool = source['ocr_tool']
  903. source_key = f"{source_name}_{ocr_tool}" # 创建唯一标识
  904. ocr_out_dir = source['ocr_out_dir']
  905. if Path(ocr_out_dir).exists():
  906. files = find_available_ocr_files(ocr_out_dir)
  907. # 为每个文件添加数据源信息
  908. for file_info in files:
  909. file_info.update({
  910. 'source_name': source_name,
  911. 'ocr_tool': ocr_tool,
  912. 'description': source.get('description', ''),
  913. 'src_img_dir': source.get('src_img_dir', ''),
  914. 'ocr_out_dir': ocr_out_dir
  915. })
  916. all_sources[source_key] = {
  917. 'files': files,
  918. 'config': source
  919. }
  920. print(f"📁 找到数据源: {source_key} - {len(files)} 个文件")
  921. return all_sources
  922. def get_data_source_display_name(source_config: Dict) -> str:
  923. """生成数据源的显示名称"""
  924. name = source_config['name']
  925. tool = source_config['ocr_tool']
  926. description = source_config.get('description', '')
  927. # 获取工具的友好名称
  928. tool_name_map = {
  929. 'dots_ocr': 'Dots OCR',
  930. 'ppstructv3': 'PPStructV3'
  931. }
  932. tool_display = tool_name_map.get(tool, tool)
  933. return f"{name} ({tool_display})"
  934. def get_nested_value(data: Dict, path: str, default=None):
  935. if not path:
  936. return default
  937. keys = path.split('.')
  938. value = data
  939. for key in keys:
  940. if isinstance(value, dict) and key in value:
  941. value = value[key]
  942. else:
  943. return default
  944. return value