_extract_docx.py 1.2 KB

12345678910111213141516171819202122232425262728293031323334353637
  1. """一次性脚本:从 MVP 需求 docx 提取纯文本到 _extract_mvp_requirements_utf8.txt"""
  2. from __future__ import annotations
  3. import sys
  4. import zipfile
  5. import xml.etree.ElementTree as ET
  6. from pathlib import Path
# Namespace prefix (in ElementTree's "{uri}local" form) for WordprocessingML
# elements; it is prepended to a local name, e.g. f"{W}p", to build tag names.
W = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
  8. def main() -> None:
  9. docx = Path(__file__).with_name("财顾报告智能化生成产品MVP版本需求文档.docx")
  10. if not docx.is_file():
  11. print("missing", docx, file=sys.stderr)
  12. sys.exit(1)
  13. root = ET.fromstring(zipfile.ZipFile(docx).read("word/document.xml"))
  14. parts: list[str] = []
  15. for p in root.iter(f"{W}p"):
  16. texts: list[str] = []
  17. for node in p.iter():
  18. tag = node.tag
  19. if tag == f"{W}t":
  20. if node.text:
  21. texts.append(node.text)
  22. if node.tail:
  23. texts.append(node.tail)
  24. line = "".join(texts).strip()
  25. if line:
  26. parts.append(line)
  27. out = Path(__file__).with_name("_extract_mvp_requirements_utf8.txt")
  28. out.write_text("\n".join(parts), encoding="utf-8")
  29. print(out, "lines", len(parts))
# Run the extraction only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()