// schema-probe - JSON Schema 特性支持度跨模型对照探测工具. // // 通过 OpenRouter 对目标模型发送工具调用请求,验证各模型对 JSON Schema 特性的实际支持情况. // // 探测特性(10 项): // 1. enum 枚举值约束 // 2. anyOf / oneOf 联合类型 // 3. nested-object 嵌套对象(2 层) // 4. additionalProperties additionalProperties: false 严格模式 // 5. $ref / $defs JSON Schema 引用 // 6. deep-nesting 深层嵌套(5 层) // 7. long-description 超长 description(>500 字符) // 8. numeric-range minimum / maximum 数值约束 // 9. string-length minLength / maxLength 字符串约束 // 10. array-items-range minItems / maxItems 数组约束 // // 使用: // // source .env && go run ./cmd/schema-probe/ [--models model1,model2] package main import ( "bufio" "context" "encoding/json" "flag" "fmt" "os" "strings" "time" "git.flytoex.net/yuanwei/flyto-agent/pkg/flyto" "git.flytoex.net/yuanwei/flyto-agent/pkg/providers/openrouter" ) func main() { loadEnvFile(".env") modelsFlag := flag.String("models", "google/gemini-2.0-flash-001,deepseek/deepseek-r1", "逗号分隔的模型 ID 列表(通过 OpenRouter 访问)", ) flag.Parse() orKey := os.Getenv("OPENROUTER_API_KEY") if orKey == "" { fmt.Fprintln(os.Stderr, "未找到 OPENROUTER_API_KEY,请 source .env 或设置环境变量") os.Exit(1) } models := strings.Split(*modelsFlag, ",") for i, m := range models { models[i] = strings.TrimSpace(m) } provider := openrouter.New(openrouter.Config{ APIKey: orKey, AppName: "schema-probe", }) ctx := context.Background() fmt.Printf("=== JSON Schema 特性支持度对照探测 | %s ===\n\n", time.Now().Format("2006-01-02 15:04:05")) fmt.Printf("目标模型: %s\n\n", strings.Join(models, ", ")) // 先收集所有模型的结果 allResults := make([]modelResult, 0, len(models)) for _, model := range models { fmt.Printf("探测 %s ...\n", model) results := runAllCases(ctx, provider, model) allResults = append(allResults, modelResult{model: model, results: results}) } // 输出对照表 printComparisonTable(allResults) } type caseResult struct { name string ok bool note string } type modelResult struct { model string results []caseResult } // runAllCases 对单个模型跑全部 10 项 schema 特性测试. 
func runAllCases(ctx context.Context, p flyto.ModelProvider, model string) []caseResult { results := make([]caseResult, 0, len(schemaCases)) for _, tc := range schemaCases { ok, note := testSchemaCase(ctx, p, model, tc) results = append(results, caseResult{ name: tc.name, ok: ok, note: note, }) } return results } // schemaCase 是单个 JSON Schema 特性测试用例. type schemaCase struct { name string feature string // 用于表头 description string // 用于输出诊断 schema string // prompt 是用户消息,引导模型调用工具并填充合法 input. // 精妙之处(CLEVER): 针对每个用例定制 prompt,而非通用"call with any valid input"-- // DeepSeek R1 等推理模型对模糊指令响应不稳定,具体值提示显著提升工具调用率. prompt string } // schemaCases 是 10 项 JSON Schema 特性测试用例. // // 升华改进(ELEVATED): 覆盖范围超过 minimax-probe-- // 新增 minimum/maximum,minLength/maxLength,minItems/maxItems 三类约束验证, // 直接对应内置工具(Bash timeout range,path minLength,tags minItems)的真实使用场景. var schemaCases = []schemaCase{ { name: "enum", feature: "enum", description: "枚举值约束(status 必须是 active/inactive/pending 之一)", schema: `{ "type": "object", "properties": { "status": {"type": "string", "enum": ["active", "inactive", "pending"]}, "name": {"type": "string"} }, "required": ["status", "name"] }`, prompt: `Call probe_tool with status="active" and name="alice".`, }, { name: "anyOf-oneOf", feature: "anyOf / oneOf", description: "联合类型(value 可为 string 或 number)", schema: `{ "type": "object", "properties": { "label": {"type": "string"}, "value": { "oneOf": [ {"type": "string"}, {"type": "number"} ] }, "extra": { "anyOf": [ {"type": "boolean"}, {"type": "string"} ] } }, "required": ["label", "value"] }`, prompt: `Call probe_tool with label="test", value=42, extra=true.`, }, { name: "nested-object", feature: "nested object", description: "嵌套对象(user 包含 id 和 email)", schema: `{ "type": "object", "properties": { "user": { "type": "object", "properties": { "id": {"type": "string"}, "email": {"type": "string"} }, "required": ["id"] }, "action": {"type": "string"} }, "required": ["user", "action"] }`, prompt: `Call probe_tool with 
user={id:"u001",email:"alice@example.com"} and action="login".`, }, { name: "additionalProperties", feature: "additionalProperties:false", description: "严格模式:不允许额外字段", schema: `{ "type": "object", "properties": { "command": {"type": "string"}, "args": {"type": "array", "items": {"type": "string"}} }, "required": ["command"], "additionalProperties": false }`, prompt: `Call probe_tool with command="ls" and args=["-la","/tmp"].`, }, { name: "ref-defs", feature: "$ref / $defs", description: "$ref 引用 $defs 定义的 Address 类型", schema: `{ "type": "object", "$defs": { "Address": { "type": "object", "properties": { "street": {"type": "string"}, "city": {"type": "string"} }, "required": ["city"] } }, "properties": { "billing": {"$ref": "#/$defs/Address"}, "shipping": {"$ref": "#/$defs/Address"} }, "required": ["billing", "shipping"] }`, prompt: `Call probe_tool with billing={city:"New York",street:"5th Ave"} and shipping={city:"LA",street:"Sunset Blvd"}.`, }, { name: "deep-nesting", feature: "deep nesting (5层)", description: "5 层深度嵌套对象", schema: `{ "type": "object", "properties": { "level1": { "type": "object", "properties": { "level2": { "type": "object", "properties": { "level3": { "type": "object", "properties": { "level4": { "type": "object", "properties": { "level5": {"type": "string"} }, "required": ["level5"] } }, "required": ["level4"] } }, "required": ["level3"] } }, "required": ["level2"] } }, "required": ["level1"] }`, prompt: `Call probe_tool with level1={level2:{level3:{level4:{level5:"deep_value"}}}}.`, }, { name: "long-description", feature: "long description (>500字符)", description: "字段 description 超过 500 字符", schema: `{ "type": "object", "properties": { "command": { "type": "string", "description": "The bash command to execute. This can be any valid bash command including pipes, redirections, and multi-line scripts. The command runs in a sandboxed environment with restricted file system access. Avoid commands that require interactive input as they will timeout. 
Maximum execution time is 120 seconds. The working directory persists between calls unless explicitly changed with cd. Environment variables can be set and will persist within the session. You can use standard Unix utilities like grep, awk, sed, find, etc. Be careful with commands that may have side effects on the file system." }, "timeout": { "type": "number", "description": "Optional timeout in milliseconds. Defaults to 30000ms (30 seconds). Maximum is 600000ms (10 minutes). If the command exceeds this timeout it will be killed and an error returned. Set to 0 to use the default timeout." } }, "required": ["command"] }`, prompt: `Call probe_tool with command="echo hello" and timeout=5000.`, }, { name: "numeric-range", feature: "minimum / maximum", description: "数值范围约束(score: 0-100,priority: 1-5)", schema: `{ "type": "object", "properties": { "score": { "type": "number", "minimum": 0, "maximum": 100 }, "priority": { "type": "integer", "minimum": 1, "maximum": 5 }, "label": {"type": "string"} }, "required": ["score", "priority", "label"] }`, prompt: `Call probe_tool with score=85, priority=3, label="high".`, }, { name: "string-length", feature: "minLength / maxLength", description: "字符串长度约束(username: 3-20字符)", schema: `{ "type": "object", "properties": { "username": { "type": "string", "minLength": 3, "maxLength": 20 }, "bio": { "type": "string", "maxLength": 200 }, "code": { "type": "string", "minLength": 6, "maxLength": 6 } }, "required": ["username", "code"] }`, prompt: `Call probe_tool with username="alice_dev", code="ABC123", bio="A software developer".`, }, { name: "array-items-range", feature: "minItems / maxItems", description: "数组项数约束(tags: 1-5项,items: 2-10项)", schema: `{ "type": "object", "properties": { "tags": { "type": "array", "items": {"type": "string"}, "minItems": 1, "maxItems": 5 }, "items": { "type": "array", "items": { "type": "object", "properties": { "id": {"type": "string"}, "qty": {"type": "number"} } }, "minItems": 2, "maxItems": 10 } }, 
"required": ["tags", "items"] }`, prompt: `Call probe_tool with tags=["go","schema","test"] and items=[{id:"a",qty:1},{id:"b",qty:2}].`, }, } // testSchemaCase 测试单个 schema 用例: // 1. 发送含该 schema 的工具定义 // 2. 要求模型调用该工具 // 3. 验证收到的 tool_use event(API 层接受即视为通过;input 内容作为诊断信息) // // 精妙之处(CLEVER): 判定标准是"API 是否接受该 schema 并返回 tool_use",而非"模型是否遵守约束值". // 模型不遵守 minimum=0 不代表 API 拒绝该 schema--我们测的是 schema 特性兼容性,不是模型对齐质量. // 若 API 因 schema 格式问题返回 400,或模型完全不触发工具调用,则视为不支持. func testSchemaCase(ctx context.Context, p flyto.ModelProvider, model string, tc schemaCase) (bool, string) { reqCtx, cancel := context.WithTimeout(ctx, 60*time.Second) defer cancel() // 压缩 schema(去除多余空白,确保 JSON 合法) var schemaObj any if err := json.Unmarshal([]byte(tc.schema), &schemaObj); err != nil { return false, "invalid-test-schema: " + err.Error() } schemaBytes, _ := json.Marshal(schemaObj) ch, err := p.Stream(reqCtx, &flyto.Request{ Model: model, MaxTokens: 512, Messages: []flyto.Message{ {Role: flyto.RoleUser, Blocks: []flyto.Block{flyto.TextBlock(tc.prompt + " You MUST call the probe_tool function.")}}, }, Tools: []flyto.Tool{ { Name: "probe_tool", Description: "A probe tool for testing JSON Schema feature support. You MUST call this tool.", InputSchema: json.RawMessage(schemaBytes), }, }, }) if err != nil { msg := err.Error() if len(msg) > 100 { msg = msg[:100] + "..." } return false, "connect-err: " + msg } var gotToolUse bool var toolInput map[string]any var streamErr string var textFallback strings.Builder for evt := range ch { switch e := evt.(type) { case *flyto.ToolUseEvent: gotToolUse = true toolInput = e.Input case *flyto.TextDeltaEvent: textFallback.WriteString(e.Text) case *flyto.ErrorEvent: streamErr = e.Err.Error() if len(streamErr) > 100 { streamErr = streamErr[:100] + "..." } } } if streamErr != "" { return false, "stream-err: " + streamErr } if !gotToolUse { preview := strings.TrimSpace(textFallback.String()) if len(preview) > 80 { preview = preview[:80] + "..." 
} if preview != "" { return false, fmt.Sprintf("no tool_use (text: %q)", preview) } return false, "no tool_use event" } if toolInput == nil { return true, "tool called, input=nil" } inputBytes, _ := json.Marshal(toolInput) preview := string(inputBytes) if len(preview) > 80 { preview = preview[:80] + "..." } return true, fmt.Sprintf("input=%s", preview) } // printComparisonTable 输出多模型对照 Markdown 表格. // // 升华改进(ELEVATED): 不只是单模型列表-- // 横向对照每个特性在各模型上的支持状态,直接暴露差异, // 减少逐一阅读每个模型结果的认知成本. func printComparisonTable(allResults []modelResult) { // 按特性名建索引 caseNames := make([]string, len(schemaCases)) for i, tc := range schemaCases { caseNames[i] = tc.name } fmt.Println() fmt.Println("## JSON Schema 特性支持对照表") fmt.Println() // 表头 header := "| Schema 特性 | 描述 |" separator := "|-------------|------|" for _, mr := range allResults { // 截短模型名(去掉 provider 前缀) short := mr.model if idx := strings.LastIndex(short, "/"); idx >= 0 { short = short[idx+1:] } header += fmt.Sprintf(" %s |", short) separator += "----------|" } fmt.Println(header) fmt.Println(separator) // 每行一个 schema 特性 for i, tc := range schemaCases { row := fmt.Sprintf("| `%s` | %s |", tc.feature, tc.description) for _, mr := range allResults { if i < len(mr.results) { r := mr.results[i] if r.ok { row += " ✓ |" } else { row += " ✗ |" } } else { row += " - |" } } fmt.Println(row) } fmt.Println() // 分模型详细诊断 for _, mr := range allResults { fmt.Printf("### %s — 诊断详情\n\n", mr.model) fmt.Printf("%-28s %-10s %s\n", "特性", "结果", "诊断") fmt.Printf("%s\n", strings.Repeat("-", 100)) for _, r := range mr.results { status := "✓" if !r.ok { status = "✗" } fmt.Printf("%-28s %-10s %s\n", r.name, status, r.note) } fmt.Println() } } // loadEnvFile 从文件加载 KEY=VALUE 到环境变量(已存在的不覆盖). 
//
// Loading is best-effort: if the file cannot be opened the function returns
// silently, and problems while reading only produce a warning on stderr.
//
// Parsing rules, per line:
//   - blank lines and lines starting with '#' are skipped;
//   - a leading "export " is accepted, because the file is also meant to be
//     consumed with `source .env`;
//   - the line is split at the first '='; lines without '=' or with an empty
//     key are skipped;
//   - key and value are trimmed, and one pair of matching surrounding single
//     or double quotes is removed from the value;
//   - a variable whose current value is non-empty is left untouched (an
//     empty value is treated like an unset variable).
func loadEnvFile(path string) {
	f, err := os.Open(path)
	if err != nil {
		return
	}
	defer f.Close()

	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		line = strings.TrimPrefix(line, "export ")

		key, val, found := strings.Cut(line, "=")
		if !found {
			continue
		}
		key = strings.TrimSpace(key)
		if key == "" {
			continue
		}
		val = strings.TrimSpace(val)
		// Shell-style files commonly quote values; strip one matching pair.
		if n := len(val); n >= 2 && (val[0] == '"' || val[0] == '\'') && val[n-1] == val[0] {
			val = val[1 : n-1]
		}

		if os.Getenv(key) != "" {
			continue
		}
		if err := os.Setenv(key, val); err != nil {
			fmt.Fprintf(os.Stderr, "警告: 无法设置环境变量 %q: %v\n", key, err)
		}
	}
	// Scan stops early on read errors or over-long lines; make that visible.
	if err := scanner.Err(); err != nil {
		fmt.Fprintf(os.Stderr, "警告: 读取 %s 时出错: %v\n", path, err)
	}
}