#!/usr/bin/env python3
"""
WMS schema dumper -- converts the upstream CWM database design xlsx
into a checked-in markdown reference + machine-readable JSON dump.

Source: CloudWM/02_Design/CWM云仓数据库设计说明书.xlsx (in the
git.flytoex.net/AWS_Team/CloudWM repo, NOT vendored here -- the xlsx
is a moving target and we do not want a stale copy under source
control).

Output (overwrites in-place under core/docs/wms-schema/):
  - schema.md      Human-readable: one section per table, field tables
  - schema.json    Machine-readable: {"tables": [{"name", "sheet",
                                       "title", "fields": [...]}, ...]}
                   Future MCP / Go schema-query servers can load this
                   directly without re-parsing xlsx.

Usage:
  python3 dump.py [--xlsx PATH] [--out-dir PATH]

  Defaults: --xlsx /tmp/CloudWM/02_Design/CWM云仓数据库设计说明书.xlsx
            --out-dir core/docs/wms-schema (relative to repo root)

Why Python (not Go): openpyxl is the standard xlsx reader; Go xlsx
libs are weaker and add dep bloat to a one-shot ETL tool. The output
artifacts (md + JSON) are what programs consume; this script is a
pre-build step, not part of the runtime.

This script is intentionally kept simple (no class hierarchy / no
lib structure) -- when WMS schema bumps a column type or adds a
table, we re-run this once and check in the new artifacts. Future
edits should keep that velocity in mind.
"""

import argparse
import json
import os
import sys
from pathlib import Path

try:
    import openpyxl
except ImportError:
    print("error: openpyxl not installed. pip install openpyxl", file=sys.stderr)
    sys.exit(2)


# Column layout in every CWM* sheet (verified against
# CWM云仓数据库设计说明书.xlsx 2026-04-28 snapshot):
#   col 0: Key (PK / FK / blank)
#   col 1: Field Name
#   col 2: Data Type
#   col 3: Required (NOT NULL / NULL)
#   col 4: Default
#   col 5: Remark        <- enum tables (CostType=0=Standard,1=Adjusted) live here
#   col 6: Need Index    (Y / blank)
#   col 7: Title         (Chinese display name)
#   col 8: Description
COLS = ["key", "field", "type", "required", "default", "remark",
        "need_index", "title", "description"]


def parse_workbook(xlsx_path):
    """
    Walk every sheet looking for `Table` marker rows. The xlsx layout
    is: a row whose col0 == "Table" introduces a table; the next row
    is the column header (Key | Field Name | Data Type | ...); the
    rows after are field defs until the next `Table` marker or sheet
    end. Empty rows are skipped (some sheets have visual gaps).

    Returns a list of {"name", "sheet", "title", "fields": [...]} dicts
    in the order tables appear in the workbook.
    """
    wb = openpyxl.load_workbook(xlsx_path, read_only=True, data_only=True)
    tables = []
    for sheet_name in wb.sheetnames:
        ws = wb[sheet_name]
        current = None
        for row in ws.iter_rows(values_only=True):
            if not row or all(c is None for c in row):
                continue
            cells = [c for c in row]
            if cells[0] == "Table":
                if current is not None:
                    tables.append(current)
                # row[1] is table name, row[2] or row[3] may carry the
                # Chinese title (placement is inconsistent in the xlsx
                # but cells 2 and 3 are the only candidates seen).
                name = (cells[1] or "").strip() if len(cells) > 1 else ""
                title = ""
                for slot in (cells[2:6] if len(cells) > 2 else []):
                    if isinstance(slot, str) and slot.strip():
                        title = slot.strip()
                        break
                current = {
                    "name": name,
                    "sheet": sheet_name,
                    "title": title,
                    "fields": [],
                }
                continue
            if current is None:
                continue
            # Skip the column-header row.
            if cells[0] == "Key" or (cells[1] == "Field Name"):
                continue
            field = {}
            for i, key in enumerate(COLS):
                if i < len(cells) and cells[i] is not None:
                    val = cells[i]
                    if not isinstance(val, str):
                        val = str(val)
                    field[key] = val.strip()
                else:
                    field[key] = ""
            # Field rows must have a Field Name; otherwise the row is
            # cosmetic (formatting / merged cells / blank padding).
            if field.get("field"):
                current["fields"].append(field)
        if current is not None:
            tables.append(current)
            current = None
    return tables


def dedupe_tables(tables):
    """
    Some xlsx pages copy a table verbatim into multiple sheets (e.g.
    ShipCostCfg appears twice in CWMACCT). Keep first occurrence,
    drop later ones with identical name+field-count to avoid markdown
    duplicates.
    """
    seen = {}
    out = []
    for t in tables:
        key = (t["name"], len(t["fields"]))
        if key in seen:
            continue
        seen[key] = True
        out.append(t)
    return out


def render_markdown(tables):
    """
    Group tables by sheet, render one heading per sheet then one
    sub-heading per table with a markdown field table. Output is
    grep-friendly (Claude/dev can `grep -A 30 "## ShipCostCfgMaster"`
    and see the field table inline).
    """
    by_sheet = {}
    for t in tables:
        by_sheet.setdefault(t["sheet"], []).append(t)

    out = []
    out.append("# CloudWM 数据库设计参考 (auto-generated, do NOT hand-edit)\n")
    out.append("Source: `CloudWM/02_Design/CWM云仓数据库设计说明书.xlsx`")
    out.append("Generator: `core/tools/wms-schema/dump.py`")
    out.append("")
    out.append("Re-run when the upstream xlsx changes:")
    out.append("```")
    out.append("git -C /tmp/CloudWM pull")
    out.append("python3 core/tools/wms-schema/dump.py")
    out.append("```")
    out.append("")
    out.append(f"**{sum(len(v) for v in by_sheet.values())}** tables across "
               f"**{len(by_sheet)}** sheets.\n")
    out.append("## Table of contents\n")
    for sheet in by_sheet:
        out.append(f"- [{sheet}](#{sheet.lower()})")
        for t in by_sheet[sheet]:
            anchor = t["name"].lower().replace(".", "")
            out.append(f"  - [{t['name']}](#{anchor})"
                       + (f" — {t['title']}" if t["title"] else ""))
    out.append("")

    for sheet, ts in by_sheet.items():
        out.append(f"\n## {sheet}\n")
        for t in ts:
            out.append(f"### {t['name']}")
            if t["title"]:
                out.append(f"\n_{t['title']}_\n")
            out.append("")
            out.append("| Key | Field | Type | Req | Default | Remark | Title | Description |")
            out.append("|---|---|---|---|---|---|---|---|")
            for f in t["fields"]:
                # markdown-escape pipe inside cell values
                cells = [f.get("key", ""), f.get("field", ""), f.get("type", ""),
                         f.get("required", ""), f.get("default", ""),
                         f.get("remark", ""), f.get("title", ""),
                         f.get("description", "")]
                cells = [c.replace("|", "\\|").replace("\n", " ") for c in cells]
                out.append("| " + " | ".join(cells) + " |")
            out.append("")
    return "\n".join(out)


def render_json(tables):
    """Stable JSON for future MCP / Go consumers."""
    return json.dumps({"tables": tables}, ensure_ascii=False, indent=2,
                      sort_keys=False)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--xlsx", default="/tmp/CloudWM/02_Design/"
                                          "CWM云仓数据库设计说明书.xlsx")
    parser.add_argument("--out-dir", default=None,
                        help="defaults to <repo-root>/core/docs/wms-schema")
    args = parser.parse_args()

    if not os.path.exists(args.xlsx):
        print(f"error: xlsx not found: {args.xlsx}", file=sys.stderr)
        print("  hint: clone CloudWM repo first:", file=sys.stderr)
        print("    git clone https://<token>@git.flytoex.net/AWS_Team/CloudWM.git "
              "/tmp/CloudWM", file=sys.stderr)
        sys.exit(2)

    if args.out_dir is None:
        # repo root = parent of core/
        script_dir = Path(__file__).resolve().parent
        repo_root = script_dir.parent.parent.parent  # tools/wms-schema -> tools -> core -> ccm
        args.out_dir = str(repo_root / "core" / "docs" / "wms-schema")
    os.makedirs(args.out_dir, exist_ok=True)

    print(f"reading {args.xlsx}", file=sys.stderr)
    tables = parse_workbook(args.xlsx)
    print(f"  parsed {len(tables)} table sections (pre-dedupe)", file=sys.stderr)
    tables = dedupe_tables(tables)
    print(f"  {len(tables)} unique tables", file=sys.stderr)

    md_path = os.path.join(args.out_dir, "schema.md")
    json_path = os.path.join(args.out_dir, "schema.json")

    md = render_markdown(tables)
    js = render_json(tables)

    with open(md_path, "w", encoding="utf-8") as f:
        f.write(md)
    with open(json_path, "w", encoding="utf-8") as f:
        f.write(js)

    print(f"wrote {md_path} ({len(md):,} chars)", file=sys.stderr)
    print(f"wrote {json_path} ({len(js):,} chars)", file=sys.stderr)


if __name__ == "__main__":
    main()