# output_formatter.py (28 KB)
  1. """
  2. 输出格式化器 - 将处理结果转换为多种格式输出
  3. 严格复用MinerU的输出格式,确保完全兼容
  4. """
  5. import json
  6. import os
  7. import sys
  8. from pathlib import Path
  9. from typing import Dict, Any, List, Union
  10. from loguru import logger
  11. import numpy as np
  12. from PIL import Image, ImageDraw, ImageFont
  13. # 导入MinerU的中间格式转换模块
  14. mineru_path = Path(__file__).parents[3]
  15. if str(mineru_path) not in sys.path:
  16. sys.path.insert(0, str(mineru_path))
  17. from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
  18. from mineru.utils.enum_class import MakeMode, BlockType, ContentType
  19. class OutputFormatter:
  20. """输出格式化器 - 严格按照MinerU格式"""
  21. def __init__(self, output_dir: str):
  22. self.output_dir = Path(output_dir)
  23. self.output_dir.mkdir(parents=True, exist_ok=True)
  24. # 颜色映射(与MinerU保持一致)
  25. self.color_map = {
  26. BlockType.TITLE: (102, 102, 255), # 蓝色
  27. BlockType.TEXT: (153, 0, 76), # 深红
  28. BlockType.IMAGE: (153, 255, 51), # 绿色
  29. BlockType.IMAGE_BODY: (153, 255, 51),
  30. BlockType.IMAGE_CAPTION: (102, 178, 255),
  31. BlockType.IMAGE_FOOTNOTE: (255, 178, 102),
  32. BlockType.TABLE: (204, 204, 0), # 黄色
  33. BlockType.TABLE_BODY: (204, 204, 0),
  34. BlockType.TABLE_CAPTION: (255, 255, 102),
  35. BlockType.TABLE_FOOTNOTE: (229, 255, 204),
  36. BlockType.INTERLINE_EQUATION: (0, 255, 0), # 亮绿
  37. BlockType.LIST: (40, 169, 92),
  38. BlockType.CODE: (102, 0, 204), # 紫色
  39. BlockType.CODE_BODY: (102, 0, 204),
  40. BlockType.CODE_CAPTION: (204, 153, 255),
  41. }
  42. def save_results(
  43. self,
  44. results: Dict[str, Any],
  45. output_config: Dict[str, Any]
  46. ) -> Dict[str, str]:
  47. """
  48. 保存处理结果为多种格式
  49. Args:
  50. results: 处理结果字典(包含pages列表,每页有processed_image)
  51. output_config: 输出配置
  52. Returns:
  53. 各种格式的输出文件路径字典
  54. """
  55. output_paths = {}
  56. # 创建文档特定的输出目录
  57. doc_name = Path(results['document_path']).stem
  58. doc_output_dir = self.output_dir / doc_name
  59. doc_output_dir.mkdir(parents=True, exist_ok=True)
  60. # 1. 转换为MinerU标准的middle.json格式
  61. middle_json = self._convert_to_middle_json(results)
  62. # 2. 保存middle.json
  63. if output_config.get('save_json', True):
  64. middle_json_path = doc_output_dir / f"{doc_name}_middle.json"
  65. with open(middle_json_path, 'w', encoding='utf-8') as f:
  66. json.dump(middle_json, f, ensure_ascii=False, indent=2)
  67. output_paths['middle_json'] = str(middle_json_path)
  68. logger.info(f"📄 Middle JSON saved: {middle_json_path}")
  69. # 3. 使用vlm_union_make生成content_list.json
  70. if output_config.get('save_content_list', True):
  71. content_list_path = self._save_content_list(
  72. middle_json, doc_output_dir, doc_name
  73. )
  74. output_paths['content_list'] = str(content_list_path)
  75. # 4. 生成Markdown
  76. if output_config.get('save_markdown', True):
  77. md_path = self._save_markdown(middle_json, doc_output_dir, doc_name)
  78. output_paths['markdown'] = str(md_path)
  79. # 5. 保存表格HTML(每个表格一个文件)
  80. if output_config.get('save_table_html', True):
  81. table_html_dir = self._save_table_htmls(
  82. middle_json, doc_output_dir, doc_name
  83. )
  84. output_paths['table_htmls'] = str(table_html_dir)
  85. # 6. 绘制布局图片
  86. if output_config.get('save_layout_image', False):
  87. layout_image_paths = self._save_layout_image(
  88. middle_json=middle_json,
  89. results=results,
  90. output_dir=doc_output_dir,
  91. doc_name=doc_name,
  92. draw_type_label=output_config.get('draw_type_label', True),
  93. draw_bbox_number=output_config.get('draw_bbox_number', True)
  94. )
  95. output_paths['layout_images'] = layout_image_paths
  96. logger.info(f"✅ Results saved to: {doc_output_dir}")
  97. return output_paths
  98. def _convert_to_middle_json(self, results: Dict[str, Any]) -> Dict[str, Any]:
  99. """
  100. 转换为MinerU标准的middle.json格式
  101. 严格按照 docs/zh/reference/output_files.md 中的VLM后端格式
  102. """
  103. middle_json = {
  104. "pdf_info": [],
  105. "_backend": "vlm", # 标记为VLM后端
  106. "_scene": results.get('scene', 'unknown'),
  107. "_version_name": "2.5.0"
  108. }
  109. for page in results['pages']:
  110. page_info = {
  111. 'page_idx': page['page_idx'],
  112. 'page_size': list(page.get('image_shape', [0, 0])[:2][::-1]), # [width, height]
  113. 'angle': page.get('angle', 0),
  114. 'para_blocks': [],
  115. 'discarded_blocks': []
  116. }
  117. # 转换每个元素为MinerU格式的block
  118. for element in page['elements']:
  119. block = self._element_to_mineru_block(element, page_info['page_size'])
  120. if block:
  121. # 根据类型分类到para_blocks或discarded_blocks
  122. if element.get('type') in ['header', 'footer', 'page_number',
  123. 'aside_text', 'page_footnote']:
  124. page_info['discarded_blocks'].append(block)
  125. else:
  126. page_info['para_blocks'].append(block)
  127. middle_json['pdf_info'].append(page_info)
  128. return middle_json
  129. def _element_to_mineru_block(
  130. self,
  131. element: Dict[str, Any],
  132. page_size: List[int]
  133. ) -> Dict[str, Any]:
  134. """
  135. 将处理结果的元素转换为MinerU标准的block格式
  136. 参考: mineru/backend/vlm/vlm_middle_json_mkcontent.py
  137. """
  138. element_type = element.get('type', '')
  139. bbox = element.get('bbox', [0, 0, 0, 0])
  140. # 归一化bbox坐标到0-1范围
  141. # normalized_bbox = self._normalize_bbox(bbox, page_size)
  142. block = {
  143. 'type': element_type,
  144. 'bbox': bbox,
  145. 'angle': element.get('angle', 0), # VLM后端特有
  146. 'lines': []
  147. }
  148. # 文本类型(text, title, ref_text等)
  149. if element_type in [BlockType.TEXT, BlockType.TITLE, BlockType.REF_TEXT,
  150. BlockType.PHONETIC, BlockType.HEADER, BlockType.FOOTER,
  151. BlockType.PAGE_NUMBER, BlockType.ASIDE_TEXT, BlockType.PAGE_FOOTNOTE]:
  152. content = element.get('content', {})
  153. text = content.get('text', '') if isinstance(content, dict) else str(content)
  154. if text:
  155. block['lines'] = [{
  156. 'bbox': bbox,
  157. 'spans': [{
  158. 'bbox': bbox,
  159. 'type': ContentType.TEXT,
  160. 'content': text
  161. }]
  162. }]
  163. # 添加标题级别
  164. if element_type == BlockType.TITLE and 'level' in element:
  165. block['level'] = element['level']
  166. # 列表类型
  167. elif element_type == BlockType.LIST:
  168. block['sub_type'] = element.get('sub_type', 'text')
  169. block['blocks'] = []
  170. list_items = element.get('content', {}).get('list_items', [])
  171. for item_text in list_items:
  172. item_block = {
  173. 'type': BlockType.TEXT,
  174. 'bbox': bbox,
  175. 'angle': 0,
  176. 'lines': [{
  177. 'bbox': bbox,
  178. 'spans': [{
  179. 'bbox': bbox,
  180. 'type': ContentType.TEXT,
  181. 'content': item_text
  182. }]
  183. }]
  184. }
  185. block['blocks'].append(item_block)
  186. # 代码块类型
  187. elif element_type == BlockType.CODE:
  188. block['sub_type'] = element.get('sub_type', 'code')
  189. block['blocks'] = []
  190. code_content = element.get('content', {})
  191. # code_body
  192. code_body = code_content.get('code_body', '')
  193. if code_body:
  194. code_body_block = {
  195. 'type': BlockType.CODE_BODY,
  196. 'bbox': bbox,
  197. 'angle': 0,
  198. 'lines': [{
  199. 'bbox': bbox,
  200. 'spans': [{
  201. 'bbox': bbox,
  202. 'type': ContentType.TEXT,
  203. 'content': code_body
  204. }]
  205. }]
  206. }
  207. block['blocks'].append(code_body_block)
  208. # 添加语言标识
  209. if 'guess_lang' in element:
  210. block['guess_lang'] = element['guess_lang']
  211. # code_caption
  212. code_caption = code_content.get('code_caption', [])
  213. for caption_text in code_caption:
  214. caption_block = {
  215. 'type': BlockType.CODE_CAPTION,
  216. 'bbox': bbox,
  217. 'angle': 0,
  218. 'lines': [{
  219. 'bbox': bbox,
  220. 'spans': [{
  221. 'bbox': bbox,
  222. 'type': ContentType.TEXT,
  223. 'content': caption_text
  224. }]
  225. }]
  226. }
  227. block['blocks'].append(caption_block)
  228. # 行间公式
  229. elif element_type == BlockType.INTERLINE_EQUATION:
  230. formula_content = element.get('content', {})
  231. latex = formula_content.get('latex', '')
  232. block['lines'] = [{
  233. 'bbox': bbox,
  234. 'spans': [{
  235. 'bbox': bbox,
  236. 'type': ContentType.INTERLINE_EQUATION,
  237. 'content': latex
  238. }]
  239. }]
  240. # 图片
  241. elif element_type == BlockType.IMAGE:
  242. block['blocks'] = []
  243. image_content = element.get('content', {})
  244. # image_body
  245. img_path = image_content.get('img_path', '')
  246. if img_path:
  247. image_body_block = {
  248. 'type': BlockType.IMAGE_BODY,
  249. 'bbox': bbox,
  250. 'angle': 0,
  251. 'lines': [{
  252. 'bbox': bbox,
  253. 'spans': [{
  254. 'bbox': bbox,
  255. 'type': ContentType.IMAGE,
  256. 'image_path': img_path
  257. }]
  258. }]
  259. }
  260. block['blocks'].append(image_body_block)
  261. # image_caption
  262. for caption_text in image_content.get('image_caption', []):
  263. caption_block = {
  264. 'type': BlockType.IMAGE_CAPTION,
  265. 'bbox': bbox,
  266. 'angle': 0,
  267. 'lines': [{
  268. 'bbox': bbox,
  269. 'spans': [{
  270. 'bbox': bbox,
  271. 'type': ContentType.TEXT,
  272. 'content': caption_text
  273. }]
  274. }]
  275. }
  276. block['blocks'].append(caption_block)
  277. # image_footnote
  278. for footnote_text in image_content.get('image_footnote', []):
  279. footnote_block = {
  280. 'type': BlockType.IMAGE_FOOTNOTE,
  281. 'bbox': bbox,
  282. 'angle': 0,
  283. 'lines': [{
  284. 'bbox': bbox,
  285. 'spans': [{
  286. 'bbox': bbox,
  287. 'type': ContentType.TEXT,
  288. 'content': footnote_text
  289. }]
  290. }]
  291. }
  292. block['blocks'].append(footnote_block)
  293. # 表格
  294. elif element_type == BlockType.TABLE:
  295. block['blocks'] = []
  296. table_content = element.get('content', {})
  297. # table_body
  298. table_html = table_content.get('html', '')
  299. img_path = table_content.get('img_path', '')
  300. if table_html or img_path:
  301. table_body_block = {
  302. 'type': BlockType.TABLE_BODY,
  303. 'bbox': bbox,
  304. 'angle': 0,
  305. 'lines': [{
  306. 'bbox': bbox,
  307. 'spans': [{
  308. 'bbox': bbox,
  309. 'type': ContentType.TABLE,
  310. 'html': table_html,
  311. 'image_path': img_path
  312. }]
  313. }]
  314. }
  315. block['blocks'].append(table_body_block)
  316. # table_caption
  317. for caption_text in table_content.get('table_caption', []):
  318. caption_block = {
  319. 'type': BlockType.TABLE_CAPTION,
  320. 'bbox': bbox,
  321. 'angle': 0,
  322. 'lines': [{
  323. 'bbox': bbox,
  324. 'spans': [{
  325. 'bbox': bbox,
  326. 'type': ContentType.TEXT,
  327. 'content': caption_text
  328. }]
  329. }]
  330. }
  331. block['blocks'].append(caption_block)
  332. # table_footnote
  333. for footnote_text in table_content.get('table_footnote', []):
  334. footnote_block = {
  335. 'type': BlockType.TABLE_FOOTNOTE,
  336. 'bbox': bbox,
  337. 'angle': 0,
  338. 'lines': [{
  339. 'bbox': bbox,
  340. 'spans': [{
  341. 'bbox': bbox,
  342. 'type': ContentType.TEXT,
  343. 'content': footnote_text
  344. }]
  345. }]
  346. }
  347. block['blocks'].append(footnote_block)
  348. return block
  349. def _normalize_bbox(self, bbox: List[float], page_size: List[int]) -> List[float]:
  350. """
  351. 将bbox归一化到0-1范围
  352. Args:
  353. bbox: [x0, y0, x1, y1] 绝对坐标
  354. page_size: [width, height] 页面尺寸
  355. Returns:
  356. 归一化后的bbox
  357. """
  358. if not bbox or len(bbox) != 4:
  359. return [0.0, 0.0, 0.0, 0.0]
  360. page_width, page_height = page_size
  361. x0, y0, x1, y1 = bbox
  362. return [
  363. x0 / page_width if page_width > 0 else 0.0,
  364. y0 / page_height if page_height > 0 else 0.0,
  365. x1 / page_width if page_width > 0 else 0.0,
  366. y1 / page_height if page_height > 0 else 0.0
  367. ]
  368. def _save_content_list(
  369. self,
  370. middle_json: Dict[str, Any],
  371. output_dir: Path,
  372. doc_name: str
  373. ) -> Path:
  374. """
  375. 使用vlm_union_make生成content_list.json
  376. """
  377. content_list_path = output_dir / f"{doc_name}_content_list.json"
  378. try:
  379. # 直接调用MinerU的vlm_union_make函数
  380. content_list = vlm_union_make(
  381. middle_json['pdf_info'],
  382. make_mode=MakeMode.CONTENT_LIST,
  383. img_buket_path='images'
  384. )
  385. with open(content_list_path, 'w', encoding='utf-8') as f:
  386. json.dump(content_list, f, ensure_ascii=False, indent=2)
  387. logger.info(f"📋 Content list saved: {content_list_path}")
  388. except Exception as e:
  389. logger.error(f"❌ Failed to generate content_list: {e}")
  390. # Fallback: 保存空列表
  391. with open(content_list_path, 'w', encoding='utf-8') as f:
  392. json.dump([], f)
  393. return content_list_path
  394. def _save_markdown(
  395. self,
  396. middle_json: Dict[str, Any],
  397. output_dir: Path,
  398. doc_name: str
  399. ) -> Path:
  400. """
  401. 使用vlm_union_make生成markdown
  402. """
  403. md_path = output_dir / f"{doc_name}.md"
  404. try:
  405. # 创建images目录
  406. images_dir = output_dir / 'images'
  407. images_dir.mkdir(exist_ok=True)
  408. # 调用MinerU的vlm_union_make生成markdown
  409. markdown_content = vlm_union_make(
  410. middle_json['pdf_info'],
  411. make_mode=MakeMode.MM_MD,
  412. img_buket_path='images'
  413. )
  414. # 添加元信息头部
  415. metadata = f"""---
  416. scene: {middle_json.get('_scene', 'unknown')}
  417. backend: {middle_json.get('_backend', 'vlm')}
  418. version: {middle_json.get('_version_name', '2.5.0')}
  419. ---
  420. """
  421. with open(md_path, 'w', encoding='utf-8') as f:
  422. f.write(metadata)
  423. f.write(markdown_content)
  424. logger.info(f"📝 Markdown saved: {md_path}")
  425. except Exception as e:
  426. logger.error(f"❌ Failed to generate markdown: {e}")
  427. # Fallback
  428. with open(md_path, 'w', encoding='utf-8') as f:
  429. f.write(f"# {doc_name}\n\n*Markdown generation failed*\n")
  430. return md_path
  431. def _save_table_htmls(
  432. self,
  433. middle_json: Dict[str, Any],
  434. output_dir: Path,
  435. doc_name: str
  436. ) -> Path:
  437. """
  438. 保存每个表格为单独的HTML文件
  439. """
  440. tables_dir = output_dir / 'tables'
  441. tables_dir.mkdir(exist_ok=True)
  442. table_count = 0
  443. for page_idx, page_info in enumerate(middle_json['pdf_info']):
  444. for block in page_info.get('para_blocks', []):
  445. if block.get('type') == BlockType.TABLE:
  446. # 提取表格HTML
  447. for sub_block in block.get('blocks', []):
  448. if sub_block.get('type') == BlockType.TABLE_BODY:
  449. for line in sub_block.get('lines', []):
  450. for span in line.get('spans', []):
  451. html_content = span.get('html', '')
  452. if html_content:
  453. # 保存表格HTML
  454. table_count += 1
  455. table_path = tables_dir / f"{doc_name}_table_{table_count}_page_{page_idx}.html"
  456. # 生成完整的HTML文档
  457. full_html = self._wrap_table_html(
  458. html_content,
  459. f"{doc_name} - Table {table_count}",
  460. page_idx
  461. )
  462. with open(table_path, 'w', encoding='utf-8') as f:
  463. f.write(full_html)
  464. logger.info(f"📊 Table {table_count} saved: {table_path}")
  465. if table_count > 0:
  466. logger.info(f"📊 Total {table_count} tables saved to: {tables_dir}")
  467. return tables_dir
  468. def _wrap_table_html(self, table_html: str, title: str, page_idx: int) -> str:
  469. """为表格HTML添加完整的HTML文档结构"""
  470. return f"""<!DOCTYPE html>
  471. <html lang="zh-CN">
  472. <head>
  473. <meta charset="UTF-8">
  474. <meta name="viewport" content="width=device-width, initial-scale=1.0">
  475. <title>{title}</title>
  476. <style>
  477. body {{
  478. font-family: Arial, "Microsoft YaHei", sans-serif;
  479. margin: 20px;
  480. background-color: #f5f5f5;
  481. }}
  482. .container {{
  483. max-width: 1200px;
  484. margin: 0 auto;
  485. background-color: white;
  486. padding: 20px;
  487. box-shadow: 0 0 10px rgba(0,0,0,0.1);
  488. }}
  489. .meta {{
  490. color: #666;
  491. font-size: 0.9em;
  492. margin-bottom: 20px;
  493. padding-bottom: 10px;
  494. border-bottom: 1px solid #ddd;
  495. }}
  496. table {{
  497. border-collapse: collapse;
  498. width: 100%;
  499. margin: 20px 0;
  500. }}
  501. th, td {{
  502. border: 1px solid #ddd;
  503. padding: 8px 12px;
  504. text-align: left;
  505. }}
  506. th {{
  507. background-color: #f2f2f2;
  508. font-weight: bold;
  509. }}
  510. tr:hover {{
  511. background-color: #f9f9f9;
  512. }}
  513. </style>
  514. </head>
  515. <body>
  516. <div class="container">
  517. <div class="meta">
  518. <p><strong>Title:</strong> {title}</p>
  519. <p><strong>Page:</strong> {page_idx + 1}</p>
  520. </div>
  521. {table_html}
  522. </div>
  523. </body>
  524. </html>"""
  525. def _save_layout_image(
  526. self,
  527. middle_json: Dict[str, Any],
  528. results: Dict[str, Any],
  529. output_dir: Path,
  530. doc_name: str,
  531. draw_type_label: bool = True,
  532. draw_bbox_number: bool = True
  533. ) -> List[Path]:
  534. """
  535. 在原始图片上绘制布局检测结果
  536. Args:
  537. middle_json: MinerU中间JSON
  538. results: 处理结果, processed_image字段包含预处理后的图像
  539. output_dir: 输出目录
  540. doc_name: 文档名称
  541. draw_type_label: 是否标注类型
  542. draw_bbox_number: 是否标注序号
  543. """
  544. layout_image_paths = []
  545. # 获取所有页面
  546. pages = results.get('pages', [])
  547. pdf_info = middle_json.get('pdf_info', [])
  548. if len(pages) == 0:
  549. logger.warning("⚠️ No pages found in results")
  550. return [output_dir]
  551. logger.info(f"🖼️ Generating layout images for {len(pages)} page(s)...")
  552. # 处理每一页
  553. for page_idx, (page, page_info) in enumerate(zip(pages, pdf_info)):
  554. original_image = page.get('processed_image')
  555. if original_image is None:
  556. logger.warning(f"⚠️ No processed_image found for page {page_idx}, skipping layout image.")
  557. continue
  558. layout_image_path = output_dir / f"{doc_name}_{page_idx + 1}_layout.png"
  559. # 读取图片
  560. if isinstance(original_image, str):
  561. image = Image.open(original_image).convert('RGB')
  562. elif isinstance(original_image, np.ndarray):
  563. image = Image.fromarray(original_image).convert('RGB')
  564. elif isinstance(original_image, Image.Image):
  565. image = original_image.convert('RGB')
  566. else:
  567. logger.error("Invalid image type")
  568. return layout_image_path
  569. # 创建绘图对象
  570. draw = ImageDraw.Draw(image, 'RGBA')
  571. # 加载字体
  572. try:
  573. font = ImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 14)
  574. except:
  575. try:
  576. font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 14)
  577. except:
  578. font = ImageFont.load_default()
  579. # 假设只处理第一页
  580. page_size = page_info.get('page_size', [image.width, image.height])
  581. image_width, image_height = image.size
  582. # 绘制所有blocks
  583. block_idx = 1
  584. for block in page_info.get('para_blocks', []) + page_info.get('discarded_blocks', []):
  585. block_type = block.get('type', '')
  586. bbox_original = block.get('bbox', [0, 0, 0, 0])
  587. x0 = int(bbox_original[0])
  588. y0 = int(bbox_original[1])
  589. x1 = int(bbox_original[2])
  590. y1 = int(bbox_original[3])
  591. # 获取颜色
  592. color = self.color_map.get(block_type, (255, 0, 0))
  593. # 绘制半透明填充
  594. overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
  595. overlay_draw = ImageDraw.Draw(overlay)
  596. overlay_draw.rectangle(
  597. [x0, y0, x1, y1],
  598. fill=(*color, 76), # 30% 透明度
  599. outline=color,
  600. width=2
  601. )
  602. image.paste(Image.alpha_composite(image.convert('RGBA'), overlay).convert('RGB'))
  603. draw = ImageDraw.Draw(image)
  604. # 绘制边框
  605. draw.rectangle([x0, y0, x1, y1], outline=color, width=2)
  606. # 标注类型
  607. if draw_type_label:
  608. label = block_type.replace('_', ' ').title()
  609. bbox_label = draw.textbbox((x0 + 2, y0 + 2), label, font=font)
  610. draw.rectangle(bbox_label, fill=color)
  611. draw.text((x0 + 2, y0 + 2), label, fill='white', font=font)
  612. # 标注序号
  613. if draw_bbox_number:
  614. number_text = str(block_idx)
  615. bbox_number = draw.textbbox((x1 - 25, y0 + 2), number_text, font=font)
  616. draw.rectangle(bbox_number, fill=(255, 0, 0))
  617. draw.text((x1 - 25, y0 + 2), number_text, fill='white', font=font)
  618. block_idx += 1
  619. # 保存图片
  620. image.save(layout_image_path)
  621. logger.info(f"🖼️ Layout image saved: {layout_image_path}")
  622. layout_image_paths.append(layout_image_path)
  623. return layout_image_paths
  624. if __name__ == "__main__":
  625. # 测试代码
  626. sample_results = {
  627. "document_path": "/path/to/sample.pdf",
  628. "scene": "financial_report",
  629. "pages": [
  630. {
  631. "page_idx": 0,
  632. "image_shape": [1654, 2338, 3],
  633. "elements": [
  634. {
  635. "type": "title",
  636. "bbox": [100, 50, 800, 100],
  637. "content": {"text": "财务报告"},
  638. "confidence": 0.98,
  639. "level": 1
  640. },
  641. {
  642. "type": "table",
  643. "bbox": [100, 200, 800, 600],
  644. "content": {
  645. "html": "<table><tr><td>项目</td><td>金额</td></tr></table>",
  646. "markdown": "| 项目 | 金额 |\n|------|------|",
  647. "table_caption": ["表1: 财务数据"],
  648. "table_footnote": []
  649. },
  650. "confidence": 0.95
  651. }
  652. ]
  653. }
  654. ]
  655. }
  656. formatter = OutputFormatter("./test_output")
  657. output_files = formatter.save_results(
  658. sample_results,
  659. {
  660. "save_json": True,
  661. "save_content_list": True,
  662. "save_markdown": True,
  663. "save_table_html": True,
  664. "save_layout_image": False
  665. }
  666. )
  667. print("Generated files:", output_files)