# zsh / finrep-report
"""One-off script: extract plain text from the MVP requirements docx into _extract_mvp_requirements_utf8.txt."""
from __future__ import annotations

import sys
import zipfile
import xml.etree.ElementTree as ET
from pathlib import Path

# WordprocessingML namespace in the "{uri}" form that ElementTree prefixes to
# element tags; used below to build qualified tag names such as f"{W}p".
W = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"


def main() -> None:
    """Extract the paragraph text of the requirements docx into a UTF-8 text file.

    Reads ``word/document.xml`` from the docx that sits next to this script,
    joins the ``w:t`` text of every ``w:p`` paragraph, drops paragraphs that
    are empty after stripping, and writes the remaining lines (separated by
    newlines) to ``_extract_mvp_requirements_utf8.txt`` in the same directory.
    The output path and the number of lines are printed to stdout.

    Raises:
        SystemExit: with status 1 when the docx file does not exist.
    """
    docx = Path(__file__).with_name("财顾报告智能化生成产品MVP版本需求文档.docx")
    if not docx.is_file():
        print("missing", docx, file=sys.stderr)
        sys.exit(1)

    # Open the archive as a context manager so its file handle is closed
    # deterministically instead of being left to the garbage collector.
    with zipfile.ZipFile(docx) as archive:
        root = ET.fromstring(archive.read("word/document.xml"))

    text_tag = f"{W}t"
    parts: list[str] = []
    for paragraph in root.iter(f"{W}p"):
        # Collect only the text of w:t elements. Their tails are skipped on
        # purpose: a run holds child elements only, so a tail is at most
        # formatting whitespace (e.g. from pretty-printed XML) that would
        # otherwise leak into the middle of the extracted line.
        line = "".join(node.text or "" for node in paragraph.iter(text_tag)).strip()
        if line:
            parts.append(line)

    out = Path(__file__).with_name("_extract_mvp_requirements_utf8.txt")
    out.write_text("\n".join(parts), encoding="utf-8")
    print(out, "lines", len(parts))


# Run the extraction only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()