| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657 |
- from __future__ import annotations
- import uuid
- from dataclasses import dataclass
@dataclass
class ExtractedFile:
    """Result of pulling plain text out of one uploaded file.

    Instances are plain value objects: the dataclass machinery supplies
    ``__init__``, ``__repr__`` and ``__eq__`` from the fields below.
    """

    # Identifier assigned to this upload by whoever builds the record.
    doc_id: str
    # Human-readable origin of the text (e.g. the uploaded file's name).
    source_label: str
    # Extracted text content; may be empty when nothing could be extracted.
    text: str
    # Optional human-readable note about an extraction problem; ``None``
    # when there is nothing to report.
    warning: str | None = None
- def _safe_decode(data: bytes) -> str:
- for enc in ("utf-8", "utf-8-sig", "gbk"):
- try:
- return data.decode(enc)
- except UnicodeDecodeError:
- continue
- return data.decode("utf-8", errors="replace")
def extract_text_from_upload(*, filename: str, data: bytes) -> ExtractedFile:
    """Turn an uploaded file's raw bytes into an ``ExtractedFile``.

    Files whose name ends in ``.pdf`` (case-insensitive) are parsed with
    ``pypdf`` and the text of all pages is joined with newlines. Every
    other file is treated as plain text and decoded via ``_safe_decode``.
    In both cases the resulting text is stripped of surrounding whitespace.

    Args:
        filename: Name of the upload; an empty or blank name falls back
            to ``"upload"``. Used as the record's ``source_label``.
        data: Raw bytes of the uploaded file.

    Returns:
        An ``ExtractedFile`` with a fresh ``doc_id`` of the form
        ``file-<12 hex chars>``. For PDFs, ``warning`` is set when no text
        was extracted (possibly a scanned document that needs OCR) or when
        parsing failed; in the failure case ``text`` is empty.
    """
    label = (filename or "upload").strip() or "upload"
    new_id = f"file-{uuid.uuid4().hex[:12]}"

    # Guard clause: anything not named *.pdf is handled as plain text.
    if not label.lower().endswith(".pdf"):
        decoded = _safe_decode(data).strip()
        return ExtractedFile(doc_id=new_id, source_label=label, text=decoded)

    try:
        # Imported inside the try so that a missing pypdf installation is
        # reported through the warning field instead of raising.
        from io import BytesIO

        from pypdf import PdfReader

        pdf = PdfReader(BytesIO(data))
        page_texts = (page.extract_text() for page in pdf.pages)
        # Pages that yield no text are skipped rather than joined as blanks.
        combined = "\n".join(chunk for chunk in page_texts if chunk).strip()
    except Exception as exc:  # noqa: BLE001
        # Best-effort: any parsing problem becomes a warning on an empty result.
        return ExtractedFile(
            doc_id=new_id,
            source_label=label,
            text="",
            warning=f"PDF 解析失败: {exc}",
        )

    notice = None if combined else "PDF 未解析出文本(可能为扫描件,需先 OCR)"
    return ExtractedFile(
        doc_id=new_id, source_label=label, text=combined, warning=notice
    )
|