#!/usr/bin/env python3 """ WMS schema dumper -- converts the upstream CWM database design xlsx into a checked-in markdown reference + machine-readable JSON dump. Source: CloudWM/02_Design/CWM云仓数据库设计说明书.xlsx (in the git.flytoex.net/AWS_Team/CloudWM repo, NOT vendored here -- the xlsx is a moving target and we do not want a stale copy under source control). Output (overwrites in-place under core/docs/wms-schema/): - schema.md Human-readable: one section per table, field tables - schema.json Machine-readable: {"tables": [{"name", "sheet", "title", "fields": [...]}, ...]} Future MCP / Go schema-query servers can load this directly without re-parsing xlsx. Usage: python3 dump.py [--xlsx PATH] [--out-dir PATH] Defaults: --xlsx /tmp/CloudWM/02_Design/CWM云仓数据库设计说明书.xlsx --out-dir core/docs/wms-schema (relative to repo root) Why Python (not Go): openpyxl is the standard xlsx reader; Go xlsx libs are weaker and add dep bloat to a one-shot ETL tool. The output artifacts (md + JSON) are what programs consume; this script is a pre-build step, not part of the runtime. This script is intentionally kept simple (no class hierarchy / no lib structure) -- when WMS schema bumps a column type or adds a table, we re-run this once and check in the new artifacts. Future edits should keep that velocity in mind. """ import argparse import json import os import sys from pathlib import Path try: import openpyxl except ImportError: print("error: openpyxl not installed. pip install openpyxl", file=sys.stderr) sys.exit(2) # Column layout in every CWM* sheet (verified against # CWM云仓数据库设计说明书.xlsx 2026-04-28 snapshot): # col 0: Key (PK / FK / blank) # col 1: Field Name # col 2: Data Type # col 3: Required (NOT NULL / NULL) # col 4: Default # col 5: Remark <- enum tables (CostType=0=Standard,1=Adjusted) live here # col 6: Need Index (Y / blank) # col 7: Title (Chinese display name) # col 8: Description COLS = ["key", "field", "type", "required", "default", "remark", "need_index", "title", "description"] def parse_workbook(xlsx_path): """ Walk every sheet looking for `Table` marker rows. The xlsx layout is: a row whose col0 == "Table" introduces a table; the next row is the column header (Key | Field Name | Data Type | ...); the rows after are field defs until the next `Table` marker or sheet end. Empty rows are skipped (some sheets have visual gaps). Returns a list of {"name", "sheet", "title", "fields": [...]} dicts in the order tables appear in the workbook. """ wb = openpyxl.load_workbook(xlsx_path, read_only=True, data_only=True) tables = [] for sheet_name in wb.sheetnames: ws = wb[sheet_name] current = None for row in ws.iter_rows(values_only=True): if not row or all(c is None for c in row): continue cells = [c for c in row] if cells[0] == "Table": if current is not None: tables.append(current) # row[1] is table name, row[2] or row[3] may carry the # Chinese title (placement is inconsistent in the xlsx # but cells 2 and 3 are the only candidates seen). name = (cells[1] or "").strip() if len(cells) > 1 else "" title = "" for slot in (cells[2:6] if len(cells) > 2 else []): if isinstance(slot, str) and slot.strip(): title = slot.strip() break current = { "name": name, "sheet": sheet_name, "title": title, "fields": [], } continue if current is None: continue # Skip the column-header row. if cells[0] == "Key" or (cells[1] == "Field Name"): continue field = {} for i, key in enumerate(COLS): if i < len(cells) and cells[i] is not None: val = cells[i] if not isinstance(val, str): val = str(val) field[key] = val.strip() else: field[key] = "" # Field rows must have a Field Name; otherwise the row is # cosmetic (formatting / merged cells / blank padding). if field.get("field"): current["fields"].append(field) if current is not None: tables.append(current) current = None return tables def dedupe_tables(tables): """ Some xlsx pages copy a table verbatim into multiple sheets (e.g. ShipCostCfg appears twice in CWMACCT). Keep first occurrence, drop later ones with identical name+field-count to avoid markdown duplicates. """ seen = {} out = [] for t in tables: key = (t["name"], len(t["fields"])) if key in seen: continue seen[key] = True out.append(t) return out def render_markdown(tables): """ Group tables by sheet, render one heading per sheet then one sub-heading per table with a markdown field table. Output is grep-friendly (Claude/dev can `grep -A 30 "## ShipCostCfgMaster"` and see the field table inline). """ by_sheet = {} for t in tables: by_sheet.setdefault(t["sheet"], []).append(t) out = [] out.append("# CloudWM 数据库设计参考 (auto-generated, do NOT hand-edit)\n") out.append("Source: `CloudWM/02_Design/CWM云仓数据库设计说明书.xlsx`") out.append("Generator: `core/tools/wms-schema/dump.py`") out.append("") out.append("Re-run when the upstream xlsx changes:") out.append("```") out.append("git -C /tmp/CloudWM pull") out.append("python3 core/tools/wms-schema/dump.py") out.append("```") out.append("") out.append(f"**{sum(len(v) for v in by_sheet.values())}** tables across " f"**{len(by_sheet)}** sheets.\n") out.append("## Table of contents\n") for sheet in by_sheet: out.append(f"- [{sheet}](#{sheet.lower()})") for t in by_sheet[sheet]: anchor = t["name"].lower().replace(".", "") out.append(f" - [{t['name']}](#{anchor})" + (f" — {t['title']}" if t["title"] else "")) out.append("") for sheet, ts in by_sheet.items(): out.append(f"\n## {sheet}\n") for t in ts: out.append(f"### {t['name']}") if t["title"]: out.append(f"\n_{t['title']}_\n") out.append("") out.append("| Key | Field | Type | Req | Default | Remark | Title | Description |") out.append("|---|---|---|---|---|---|---|---|") for f in t["fields"]: # markdown-escape pipe inside cell values cells = [f.get("key", ""), f.get("field", ""), f.get("type", ""), f.get("required", ""), f.get("default", ""), f.get("remark", ""), f.get("title", ""), f.get("description", "")] cells = [c.replace("|", "\\|").replace("\n", " ") for c in cells] out.append("| " + " | ".join(cells) + " |") out.append("") return "\n".join(out) def render_json(tables): """Stable JSON for future MCP / Go consumers.""" return json.dumps({"tables": tables}, ensure_ascii=False, indent=2, sort_keys=False) def main(): parser = argparse.ArgumentParser() parser.add_argument("--xlsx", default="/tmp/CloudWM/02_Design/" "CWM云仓数据库设计说明书.xlsx") parser.add_argument("--out-dir", default=None, help="defaults to /core/docs/wms-schema") args = parser.parse_args() if not os.path.exists(args.xlsx): print(f"error: xlsx not found: {args.xlsx}", file=sys.stderr) print(" hint: clone CloudWM repo first:", file=sys.stderr) print(" git clone https://@git.flytoex.net/AWS_Team/CloudWM.git " "/tmp/CloudWM", file=sys.stderr) sys.exit(2) if args.out_dir is None: # repo root = parent of core/ script_dir = Path(__file__).resolve().parent repo_root = script_dir.parent.parent.parent # tools/wms-schema -> tools -> core -> ccm args.out_dir = str(repo_root / "core" / "docs" / "wms-schema") os.makedirs(args.out_dir, exist_ok=True) print(f"reading {args.xlsx}", file=sys.stderr) tables = parse_workbook(args.xlsx) print(f" parsed {len(tables)} table sections (pre-dedupe)", file=sys.stderr) tables = dedupe_tables(tables) print(f" {len(tables)} unique tables", file=sys.stderr) md_path = os.path.join(args.out_dir, "schema.md") json_path = os.path.join(args.out_dir, "schema.json") md = render_markdown(tables) js = render_json(tables) with open(md_path, "w", encoding="utf-8") as f: f.write(md) with open(json_path, "w", encoding="utf-8") as f: f.write(js) print(f"wrote {md_path} ({len(md):,} chars)", file=sys.stderr) print(f"wrote {json_path} ({len(js):,} chars)", file=sys.stderr) if __name__ == "__main__": main()