"""一次性脚本:从 MVP 需求 docx 提取纯文本到 _extract_mvp_requirements_utf8.txt""" from __future__ import annotations import sys import zipfile import xml.etree.ElementTree as ET from pathlib import Path W = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}" def main() -> None: docx = Path(__file__).with_name("财顾报告智能化生成产品MVP版本需求文档.docx") if not docx.is_file(): print("missing", docx, file=sys.stderr) sys.exit(1) root = ET.fromstring(zipfile.ZipFile(docx).read("word/document.xml")) parts: list[str] = [] for p in root.iter(f"{W}p"): texts: list[str] = [] for node in p.iter(): tag = node.tag if tag == f"{W}t": if node.text: texts.append(node.text) if node.tail: texts.append(node.tail) line = "".join(texts).strip() if line: parts.append(line) out = Path(__file__).with_name("_extract_mvp_requirements_utf8.txt") out.write_text("\n".join(parts), encoding="utf-8") print(out, "lines", len(parts)) if __name__ == "__main__": main()