from __future__ import annotations

import uuid
from dataclasses import dataclass


@dataclass
class ExtractedFile:
    """Result of extracting plain text from one uploaded file."""

    # Generated identifier of the form "file-<12 hex chars>".
    doc_id: str
    # The (stripped) upload filename, or "upload" when none was given.
    source_label: str
    # Extracted text with surrounding whitespace stripped; "" on failure.
    text: str
    # Human-readable note when extraction failed or produced no text.
    warning: str | None = None


def _safe_decode(data: bytes) -> str:
    """Decode *data* to text, trying a few encodings before giving up.

    Order matters: ``utf-8-sig`` is tried first because it accepts UTF-8
    both with and without a byte-order mark and drops the BOM when present.
    Plain ``utf-8`` would also succeed on BOM-prefixed input but would leave
    U+FEFF at the start of the text (``str.strip`` does not remove it).
    ``gbk`` is the fallback for input that is not valid UTF-8. If nothing
    decodes cleanly, UTF-8 with replacement characters is used, so this
    function never raises ``UnicodeDecodeError``.
    """
    for encoding in ("utf-8-sig", "gbk"):
        try:
            return data.decode(encoding)
        except UnicodeDecodeError:
            continue
    return data.decode("utf-8", errors="replace")


def _extract_pdf(*, doc_id: str, name: str, data: bytes) -> ExtractedFile:
    """Extract text from PDF bytes, reporting any failure via ``warning``."""
    try:
        # Imported lazily so that a missing pypdf install is reported as a
        # per-file warning rather than breaking import of this module.
        from io import BytesIO

        from pypdf import PdfReader

        reader = PdfReader(BytesIO(data))
        page_texts = (page.extract_text() for page in reader.pages)
        # Pages with no extractable text (None or "") are skipped.
        text = "\n".join(t for t in page_texts if t).strip()
    except Exception as e:  # noqa: BLE001
        # Deliberate best-effort boundary: any parser error (or a missing
        # pypdf) becomes a warning on an otherwise empty result.
        return ExtractedFile(
            doc_id=doc_id,
            source_label=name,
            text="",
            warning=f"PDF 解析失败: {e}",
        )

    warn = None
    if not text:
        warn = "PDF 未解析出文本(可能为扫描件,需先 OCR)"
    return ExtractedFile(doc_id=doc_id, source_label=name, text=text, warning=warn)


def extract_text_from_upload(*, filename: str, data: bytes) -> ExtractedFile:
    """Extract plain text from an uploaded file.

    Files whose name ends in ``.pdf`` (case-insensitive) are parsed with
    pypdf; everything else is treated as encoded text and decoded with
    ``_safe_decode``.

    Args:
        filename: Original upload name. Empty or whitespace-only values
            fall back to ``"upload"``.
        data: Raw file content.

    Returns:
        An ``ExtractedFile`` with a freshly generated ``doc_id``. PDF
        failures, and PDFs that yield no text, are reported through
        ``warning`` with an empty ``text`` instead of raising.
    """
    name = (filename or "upload").strip() or "upload"
    doc_id = f"file-{uuid.uuid4().hex[:12]}"

    if name.lower().endswith(".pdf"):
        return _extract_pdf(doc_id=doc_id, name=name, data=data)

    text = _safe_decode(data).strip()
    return ExtractedFile(doc_id=doc_id, source_label=name, text=text)