| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187 |
- """
- HTML 生成器模块
- 提供 HTML 输出功能:
- - 表格 HTML 生成(带样式)
- - 单元格坐标展示
- """
- import json
- from pathlib import Path
- from typing import Dict, Any, List
- from loguru import logger
class HTMLGenerator:
    """Render extracted tables as standalone, styled HTML pages."""

    # Element ``type`` values that are treated as tables.
    _TABLE_TYPES = ('table', 'table_body')

    @staticmethod
    def save_table_htmls(
        results: Dict[str, Any],
        output_dir: Path,
        doc_name: str
    ) -> Path:
        """
        Write one HTML file per table found in ``results``.

        Every element whose ``type`` is ``'table'`` or ``'table_body'`` is
        counted; only those whose ``content['html']`` is non-empty are
        written to ``<output_dir>/tables``.

        Args:
            results: Processing result; read as
                ``results['pages'][i]['elements'][j]``.
            output_dir: Base output directory. It is created (together with
                the ``tables`` sub-directory) when missing.
            doc_name: Document name, used in file names and page headers.

        Returns:
            Path of the ``tables`` directory.
        """
        tables_dir = output_dir / 'tables'
        # parents=True: do not fail when output_dir itself does not exist yet.
        tables_dir.mkdir(parents=True, exist_ok=True)

        table_count = 0   # running index over every table element seen
        saved_count = 0   # tables that actually produced a file

        for page in results.get('pages', []):
            page_idx = page.get('page_idx', 0)

            for element in page.get('elements', []):
                if element.get('type') not in HTMLGenerator._TABLE_TYPES:
                    continue

                # The index advances even for tables without HTML so that
                # file numbering stays the same as before.
                table_count += 1

                # ``or`` guards against keys that are present but None.
                content = element.get('content') or {}
                table_html = content.get('html', '')
                if not table_html:
                    continue
                cells = content.get('cells') or []

                full_html = HTMLGenerator._generate_table_html_with_styles(
                    table_html, cells, doc_name, page_idx, table_count
                )

                html_path = tables_dir / (
                    f"{doc_name}_table_{table_count}_page_{page_idx + 1}.html"
                )
                html_path.write_text(full_html, encoding='utf-8')
                saved_count += 1

        # Report the number of files written, not the number of tables seen.
        if saved_count > 0:
            logger.info(f"📊 {saved_count} tables saved to: {tables_dir}")

        return tables_dir

    @staticmethod
    def _generate_table_html_with_styles(
        table_html: str,
        cells: List[Dict],
        doc_name: str,
        page_idx: int,
        table_idx: int
    ) -> str:
        """
        Wrap a table fragment in a complete, styled HTML document.

        ``table_html`` is inserted verbatim because it is already markup.
        ``doc_name`` and the JSON dump of ``cells`` are HTML-escaped because
        they are plain text placed inside text nodes.

        Args:
            table_html: HTML markup of the table.
            cells: Cell dictionaries dumped as JSON below the table;
                ``None`` or an empty list is rendered as ``[]``.
            doc_name: Document name shown in the title and header.
            page_idx: Zero-based page index; displayed as ``page_idx + 1``.
            table_idx: Table number shown in the title and header.

        Returns:
            The complete HTML document as a string.
        """
        # Local import keeps this class self-contained.
        from html import escape

        cells = cells or []
        cells_json = (
            json.dumps(cells, ensure_ascii=False, indent=2) if cells else "[]"
        )

        # quote=False: these values only appear in text nodes, never inside
        # attribute values, so quotes can stay readable in the page source.
        safe_doc_name = escape(str(doc_name), quote=False)
        safe_cells_json = escape(cells_json, quote=False)
        page_number = page_idx + 1
        cell_total = len(cells)

        return f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{safe_doc_name} - Table {table_idx}</title>
    <style>
        body {{
            font-family: Arial, "Microsoft YaHei", sans-serif;
            margin: 20px;
            background-color: #f5f5f5;
        }}
        .container {{
            max-width: 1400px;
            margin: 0 auto;
            background-color: white;
            padding: 20px;
            box-shadow: 0 0 10px rgba(0,0,0,0.1);
            border-radius: 8px;
        }}
        .meta {{
            color: #666;
            font-size: 0.9em;
            margin-bottom: 20px;
            padding-bottom: 10px;
            border-bottom: 1px solid #ddd;
        }}
        table {{
            border-collapse: collapse;
            width: 100%;
            margin: 20px 0;
        }}
        th, td {{
            border: 1px solid #ddd;
            padding: 8px 12px;
            text-align: left;
        }}
        th {{
            background-color: #f2f2f2;
            font-weight: bold;
        }}
        tr:hover {{
            background-color: #f9f9f9;
        }}
        td[data-bbox], th[data-bbox] {{
            position: relative;
        }}
        td[data-bbox]:hover::after, th[data-bbox]:hover::after {{
            content: attr(data-bbox);
            position: absolute;
            bottom: 100%;
            left: 0;
            background: #333;
            color: white;
            padding: 2px 6px;
            font-size: 10px;
            border-radius: 3px;
            white-space: nowrap;
            z-index: 100;
        }}
        .cells-info {{
            margin-top: 30px;
            padding: 15px;
            background-color: #f8f9fa;
            border-radius: 5px;
        }}
        .cells-info summary {{
            cursor: pointer;
            font-weight: bold;
            color: #333;
        }}
        .cells-info pre {{
            background-color: #2d2d2d;
            color: #f8f8f2;
            padding: 15px;
            border-radius: 5px;
            overflow-x: auto;
            font-size: 12px;
        }}
    </style>
</head>
<body>
    <div class="container">
        <div class="meta">
            <p><strong>Document:</strong> {safe_doc_name}</p>
            <p><strong>Page:</strong> {page_number}</p>
            <p><strong>Table:</strong> {table_idx}</p>
            <p><strong>Cells with coordinates:</strong> {cell_total}</p>
        </div>

        {table_html}

        <div class="cells-info">
            <details>
                <summary>📍 单元格坐标数据 (JSON)</summary>
                <pre>{safe_cells_json}</pre>
            </details>
        </div>
    </div>
</body>
</html>"""
|