html_generator.py 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187
"""
HTML generator module.

Provides HTML output features:
- Table HTML generation (with styles)
- Cell coordinate display
"""
  7. import json
  8. from pathlib import Path
  9. from typing import Dict, Any, List
  10. from loguru import logger
  11. class HTMLGenerator:
  12. """HTML 生成器类"""
  13. @staticmethod
  14. def save_table_htmls(
  15. results: Dict[str, Any],
  16. output_dir: Path,
  17. doc_name: str
  18. ) -> Path:
  19. """
  20. 保存表格 HTML 文件
  21. Args:
  22. results: 处理结果
  23. output_dir: 输出目录
  24. doc_name: 文档名称
  25. Returns:
  26. 表格目录路径
  27. """
  28. tables_dir = output_dir / 'tables'
  29. tables_dir.mkdir(exist_ok=True)
  30. table_count = 0
  31. for page in results.get('pages', []):
  32. page_idx = page.get('page_idx', 0)
  33. for element in page.get('elements', []):
  34. if element.get('type') in ['table', 'table_body']:
  35. table_count += 1
  36. content = element.get('content', {})
  37. html = content.get('html', '')
  38. cells = content.get('cells', [])
  39. if html:
  40. full_html = HTMLGenerator._generate_table_html_with_styles(
  41. html, cells, doc_name, page_idx, table_count
  42. )
  43. html_path = tables_dir / f"{doc_name}_table_{table_count}_page_{page_idx + 1}.html"
  44. with open(html_path, 'w', encoding='utf-8') as f:
  45. f.write(full_html)
  46. if table_count > 0:
  47. logger.info(f"📊 {table_count} tables saved to: {tables_dir}")
  48. return tables_dir
  49. @staticmethod
  50. def _generate_table_html_with_styles(
  51. table_html: str,
  52. cells: List[Dict],
  53. doc_name: str,
  54. page_idx: int,
  55. table_idx: int
  56. ) -> str:
  57. """
  58. 生成带样式的完整 HTML
  59. Args:
  60. table_html: 表格 HTML 内容
  61. cells: 单元格列表
  62. doc_name: 文档名称
  63. page_idx: 页码
  64. table_idx: 表格序号
  65. Returns:
  66. 完整的 HTML 字符串
  67. """
  68. cells_json = json.dumps(cells, ensure_ascii=False, indent=2) if cells else "[]"
  69. return f"""<!DOCTYPE html>
  70. <html lang="zh-CN">
  71. <head>
  72. <meta charset="UTF-8">
  73. <meta name="viewport" content="width=device-width, initial-scale=1.0">
  74. <title>{doc_name} - Table {table_idx}</title>
  75. <style>
  76. body {{
  77. font-family: Arial, "Microsoft YaHei", sans-serif;
  78. margin: 20px;
  79. background-color: #f5f5f5;
  80. }}
  81. .container {{
  82. max-width: 1400px;
  83. margin: 0 auto;
  84. background-color: white;
  85. padding: 20px;
  86. box-shadow: 0 0 10px rgba(0,0,0,0.1);
  87. border-radius: 8px;
  88. }}
  89. .meta {{
  90. color: #666;
  91. font-size: 0.9em;
  92. margin-bottom: 20px;
  93. padding-bottom: 10px;
  94. border-bottom: 1px solid #ddd;
  95. }}
  96. table {{
  97. border-collapse: collapse;
  98. width: 100%;
  99. margin: 20px 0;
  100. }}
  101. th, td {{
  102. border: 1px solid #ddd;
  103. padding: 8px 12px;
  104. text-align: left;
  105. }}
  106. th {{
  107. background-color: #f2f2f2;
  108. font-weight: bold;
  109. }}
  110. tr:hover {{
  111. background-color: #f9f9f9;
  112. }}
  113. td[data-bbox], th[data-bbox] {{
  114. position: relative;
  115. }}
  116. td[data-bbox]:hover::after, th[data-bbox]:hover::after {{
  117. content: attr(data-bbox);
  118. position: absolute;
  119. bottom: 100%;
  120. left: 0;
  121. background: #333;
  122. color: white;
  123. padding: 2px 6px;
  124. font-size: 10px;
  125. border-radius: 3px;
  126. white-space: nowrap;
  127. z-index: 100;
  128. }}
  129. .cells-info {{
  130. margin-top: 30px;
  131. padding: 15px;
  132. background-color: #f8f9fa;
  133. border-radius: 5px;
  134. }}
  135. .cells-info summary {{
  136. cursor: pointer;
  137. font-weight: bold;
  138. color: #333;
  139. }}
  140. .cells-info pre {{
  141. background-color: #2d2d2d;
  142. color: #f8f8f2;
  143. padding: 15px;
  144. border-radius: 5px;
  145. overflow-x: auto;
  146. font-size: 12px;
  147. }}
  148. </style>
  149. </head>
  150. <body>
  151. <div class="container">
  152. <div class="meta">
  153. <p><strong>Document:</strong> {doc_name}</p>
  154. <p><strong>Page:</strong> {page_idx + 1}</p>
  155. <p><strong>Table:</strong> {table_idx}</p>
  156. <p><strong>Cells with coordinates:</strong> {len(cells)}</p>
  157. </div>
  158. {table_html}
  159. <div class="cells-info">
  160. <details>
  161. <summary>📍 单元格坐标数据 (JSON)</summary>
  162. <pre>{cells_json}</pre>
  163. </details>
  164. </div>
  165. </div>
  166. </body>
  167. </html>"""