""" HTML 生成器模块 提供 HTML 输出功能: - 表格 HTML 生成(带样式) - 单元格坐标展示 """ import json from pathlib import Path from typing import Dict, Any, List from loguru import logger class HTMLGenerator: """HTML 生成器类""" @staticmethod def save_table_htmls( results: Dict[str, Any], output_dir: Path, doc_name: str ) -> Path: """ 保存表格 HTML 文件 Args: results: 处理结果 output_dir: 输出目录 doc_name: 文档名称 Returns: 表格目录路径 """ tables_dir = output_dir / 'tables' tables_dir.mkdir(exist_ok=True) table_count = 0 for page in results.get('pages', []): page_idx = page.get('page_idx', 0) for element in page.get('elements', []): if element.get('type') in ['table', 'table_body']: table_count += 1 content = element.get('content', {}) html = content.get('html', '') cells = content.get('cells', []) if html: full_html = HTMLGenerator._generate_table_html_with_styles( html, cells, doc_name, page_idx, table_count ) html_path = tables_dir / f"{doc_name}_table_{table_count}_page_{page_idx + 1}.html" with open(html_path, 'w', encoding='utf-8') as f: f.write(full_html) if table_count > 0: logger.info(f"📊 {table_count} tables saved to: {tables_dir}") return tables_dir @staticmethod def _generate_table_html_with_styles( table_html: str, cells: List[Dict], doc_name: str, page_idx: int, table_idx: int ) -> str: """ 生成带样式的完整 HTML Args: table_html: 表格 HTML 内容 cells: 单元格列表 doc_name: 文档名称 page_idx: 页码 table_idx: 表格序号 Returns: 完整的 HTML 字符串 """ cells_json = json.dumps(cells, ensure_ascii=False, indent=2) if cells else "[]" return f"""
{cells_json}