| 12345678910111213141516171819202122232425262728293031323334353637 |
- """一次性脚本:从 MVP 需求 docx 提取纯文本到 _extract_mvp_requirements_utf8.txt"""
- from __future__ import annotations
- import sys
- import zipfile
- import xml.etree.ElementTree as ET
- from pathlib import Path
- W = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
def main() -> None:
    """Extract the plain text of the MVP requirements .docx into a UTF-8 file.

    Reads ``word/document.xml`` from the .docx located next to this script,
    concatenates the ``w:t`` text found under each ``w:p`` paragraph into one
    line, skips paragraphs that are empty after stripping, and writes the
    remaining lines (newline-separated) to
    ``_extract_mvp_requirements_utf8.txt`` in the same directory.  The output
    path and the number of lines written are printed to stdout.

    Raises:
        SystemExit: with exit code 1 when the source .docx does not exist
            (a "missing <path>" message is printed to stderr first).
    """
    docx = Path(__file__).with_name("财顾报告智能化生成产品MVP版本需求文档.docx")
    if not docx.is_file():
        print("missing", docx, file=sys.stderr)
        sys.exit(1)

    # Read the XML inside a ``with`` block so the archive's file handle is
    # closed right away instead of lingering until garbage collection.
    with zipfile.ZipFile(docx) as archive:
        root = ET.fromstring(archive.read("word/document.xml"))

    text_tag = f"{W}t"  # loop-invariant, so build the qualified tag once
    parts: list[str] = []
    for p in root.iter(f"{W}p"):
        texts: list[str] = []
        # iter(tag) walks the paragraph's descendants in document order and
        # yields only the w:t elements.
        for node in p.iter(text_tag):
            if node.text:
                texts.append(node.text)
            # NOTE(review): tail text following a w:t element is collected as
            # well; presumably it is empty in a normal document.xml - confirm
            # whether it should be kept.
            if node.tail:
                texts.append(node.tail)
        line = "".join(texts).strip()
        if line:
            parts.append(line)

    out = Path(__file__).with_name("_extract_mvp_requirements_utf8.txt")
    out.write_text("\n".join(parts), encoding="utf-8")
    print(out, "lines", len(parts))
# Script entry point: run the extraction only when this file is executed
# directly, so importing the module has no side effects.
if __name__ == "__main__":
    main()
|