package daemon

import (
	"context"
	"sync/atomic"
	"testing"
	"time"
)

// TestDaemonManager_HeartbeatDisabled_NoService locks sub-claim (a-neg):
// when DaemonConfig.HeartbeatInterval is zero, no HeartbeatService is
// constructed. This guards against "wire always active" regressions
// that would leak a goroutine even when the operator explicitly
// disabled heartbeat via interval=0.
func TestDaemonManager_HeartbeatDisabled_NoService(t *testing.T) {
	dm := NewDaemonManager(nil, nil, DaemonConfig{})
	if dm.heartbeat != nil {
		t.Fatalf("dm.heartbeat = %v, want nil when HeartbeatInterval=0", dm.heartbeat)
	}
}

// TestDaemonManager_HeartbeatEnabled_ForwardsConfig locks sub-claim (a):
// cfg.HeartbeatInterval AND cfg.HeartbeatTimeout are both read and
// forwarded into the HeartbeatConfig carried by the constructed
// HeartbeatService. This is the single production read-site for both
// DaemonConfig fields -- if the values drift here, the runtime scan
// frequency and staleness threshold drift silently.
func TestDaemonManager_HeartbeatEnabled_ForwardsConfig(t *testing.T) {
	const (
		wantInterval = 7 * time.Second
		wantTimeout  = 21 * time.Second
	)
	dm := NewDaemonManager(nil, nil, DaemonConfig{
		HeartbeatInterval: wantInterval,
		HeartbeatTimeout:  wantTimeout,
	})
	if dm.heartbeat == nil {
		t.Fatalf("dm.heartbeat = nil, want non-nil when HeartbeatInterval>0")
	}
	if got := dm.heartbeat.cfg.Interval; got != wantInterval {
		t.Errorf("dm.heartbeat.cfg.Interval = %v, want %v (cfg.HeartbeatInterval not forwarded)", got, wantInterval)
	}
	if got := dm.heartbeat.cfg.Timeout; got != wantTimeout {
		t.Errorf("dm.heartbeat.cfg.Timeout = %v, want %v (cfg.HeartbeatTimeout not forwarded)", got, wantTimeout)
	}
}
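
// For orientation: the construction gate the two tests above lock
// plausibly reads like the sketch below. Illustrative only -- the
// NewHeartbeatService name and exact wiring are assumptions; the tests
// pin only the nil/non-nil outcome and the forwarded cfg values:
//
//	if cfg.HeartbeatInterval > 0 {
//		dm.heartbeat = NewHeartbeatService(HeartbeatConfig{
//			Interval: cfg.HeartbeatInterval,
//			Timeout:  cfg.HeartbeatTimeout,
//		})
//	}
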
// TestDaemonManager_CloseSession_UnregistersFromHeartbeat locks the
// sub-claim that DaemonManager lifecycle hooks actually flow through
// the heartbeat surface: closeSession must Unregister, or the scan
// goroutine will fire closeSession again on the next tick for a
// session that no longer exists (benign but noisy). We seed the
// heartbeat map directly via the public Register API (as DaemonManager
// does in getOrCreateSession) and a stub sessions entry so closeSession
// can navigate without a real engine/transport, then assert the
// heartbeat no longer tracks the ID.
func TestDaemonManager_CloseSession_UnregistersFromHeartbeat(t *testing.T) {
	dm := NewDaemonManager(nil, nil, DaemonConfig{
		HeartbeatInterval: 1 * time.Second,
		HeartbeatTimeout:  3 * time.Second,
	})
	if dm.heartbeat == nil {
		t.Fatalf("prerequisite: heartbeat should be non-nil")
	}
	const sid = "test-session-close"

	// Seed as if getOrCreateSession had run: the minimum state closeSession
	// can traverse without a real engine / transport / isolation.
	dm.sessions[sid] = &ManagedSession{isolation: SharedIsolation{}}
	dm.heartbeat.Register(sid)
	if n := countHeartbeatSessions(dm.heartbeat); n != 1 {
		t.Fatalf("heartbeat should track 1 session before close, got %d", n)
	}

	dm.closeSession(sid)

	if n := countHeartbeatSessions(dm.heartbeat); n != 0 {
		t.Errorf("heartbeat should track 0 sessions after closeSession, got %d", n)
	}
}

// countHeartbeatSessions is a white-box helper for _test.go only --
// the package-internal sessions map is not part of the public API.
func countHeartbeatSessions(hs *HeartbeatService) int {
	hs.mu.RLock()
	defer hs.mu.RUnlock()
	return len(hs.sessions)
}

// TestDaemonManager_CrashRecoveryDisabled_NoWrapper locks sub-claim
// (a-neg): when DaemonConfig.CrashRecovery is nil, no CrashRecovery
// wrapper is constructed. Mirrors the Heartbeat-disabled invariant:
// the operator opts in via a non-nil pointer; a zero-value config must
// not silently engage a retry policy the operator didn't choose.
func TestDaemonManager_CrashRecoveryDisabled_NoWrapper(t *testing.T) {
	dm := NewDaemonManager(nil, nil, DaemonConfig{})
	if dm.crashRecovery != nil {
		t.Fatalf("dm.crashRecovery = %v, want nil when cfg.CrashRecovery=nil", dm.crashRecovery)
	}
}

// TestDaemonManager_CrashRecoveryEnabled_ForwardsConfig locks
// sub-claim (a): NewDaemonManager dereferences cfg.CrashRecovery and
// forwards its fields (MaxRetries, InitialDelay, MaxDelay, and
// Multiplier) into the CrashRecovery constructor. If any field drifts
// here, the runtime retry policy drifts silently -- the single
// production read-site must faithfully carry operator intent.
func TestDaemonManager_CrashRecoveryEnabled_ForwardsConfig(t *testing.T) {
	policy := &CrashRecoveryConfig{
		MaxRetries:   5,
		InitialDelay: 2 * time.Second,
		MaxDelay:     30 * time.Second,
		Multiplier:   3.0,
	}
	dm := NewDaemonManager(nil, nil, DaemonConfig{CrashRecovery: policy})
	if dm.crashRecovery == nil {
		t.Fatalf("dm.crashRecovery = nil, want non-nil when cfg.CrashRecovery != nil")
	}
	got := dm.crashRecovery.cfg
	if got.MaxRetries != policy.MaxRetries {
		t.Errorf("crashRecovery.cfg.MaxRetries = %d, want %d", got.MaxRetries, policy.MaxRetries)
	}
	if got.InitialDelay != policy.InitialDelay {
		t.Errorf("crashRecovery.cfg.InitialDelay = %v, want %v", got.InitialDelay, policy.InitialDelay)
	}
	if got.MaxDelay != policy.MaxDelay {
		t.Errorf("crashRecovery.cfg.MaxDelay = %v, want %v", got.MaxDelay, policy.MaxDelay)
	}
	if got.Multiplier != policy.Multiplier {
		t.Errorf("crashRecovery.cfg.Multiplier = %v, want %v", got.Multiplier, policy.Multiplier)
	}
}
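
// For orientation: the crash-recovery gate mirrors the heartbeat one.
// A plausible sketch (illustrative only -- the NewCrashRecovery name is
// an assumption; the tests pin only the nil/non-nil outcome and the
// forwarded policy fields):
//
//	if cfg.CrashRecovery != nil {
//		dm.crashRecovery = NewCrashRecovery(*cfg.CrashRecovery)
//	}
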
// TestRunWithRecover_NormalReturnsNil locks the happy path of the
// panic-to-error bridge: a fn that completes normally must return nil,
// so CrashRecovery does not retry a perfectly successful prompt run.
// This guards against the "every completed prompt triggers 3 bonus
// retries" regression.
func TestRunWithRecover_NormalReturnsNil(t *testing.T) {
	called := false
	err := runWithRecover(func() { called = true })
	if err != nil {
		t.Errorf("runWithRecover returned err=%v for normal fn, want nil", err)
	}
	if !called {
		t.Errorf("runWithRecover did not invoke fn")
	}
}

// TestRunWithRecover_PanicToError locks the crash-reification side:
// a panicking fn must surface as a non-nil error, otherwise
// CrashRecovery can never fire. The error is intentionally opaque
// (no special type) because it is consumed only by CrashRecovery's
// generic error-means-retry logic.
func TestRunWithRecover_PanicToError(t *testing.T) {
	err := runWithRecover(func() { panic("boom") })
	if err == nil {
		t.Fatalf("runWithRecover returned nil for panicking fn, want non-nil")
	}
}

// TestCrashRecovery_WrapsPanickingPromptRun locks the end-to-end
// sub-claim: runWithRecover + CrashRecovery.RunWithRecovery compose
// into a real retry loop when a prompt handler panics. We use the
// same CrashRecovery the daemon would build (via NewDaemonManager)
// and assert OnCrash fires once per attempt, OnGiveUp fires once at
// the end, and the panicking fn is called exactly MaxRetries+1 times
// (first attempt + retries). This is the critical evidence that
// dm.crashRecovery is *consumed*, not merely stored.
func TestCrashRecovery_WrapsPanickingPromptRun(t *testing.T) {
	var crashCount, giveUpCount, invokeCount atomic.Int32
	policy := &CrashRecoveryConfig{
		MaxRetries:   2,
		InitialDelay: 1 * time.Millisecond,
		MaxDelay:     2 * time.Millisecond,
		Multiplier:   2.0,
		OnCrash:      func(_ string, _ int, _ error) { crashCount.Add(1) },
		OnGiveUp:     func(_ string, _ int) { giveUpCount.Add(1) },
	}
	dm := NewDaemonManager(nil, nil, DaemonConfig{CrashRecovery: policy})
	if dm.crashRecovery == nil {
		t.Fatalf("prerequisite: dm.crashRecovery should be non-nil")
	}

	runFn := func() error {
		return runWithRecover(func() {
			invokeCount.Add(1)
			panic("simulated prompt crash")
		})
	}
	err := dm.crashRecovery.RunWithRecovery(context.Background(), "test-session", runFn)
	if err == nil {
		t.Fatalf("RunWithRecovery returned nil, want non-nil (fn always panics)")
	}

	// MaxRetries=2 means first attempt + 2 retries = 3 total invocations,
	// 3 OnCrash callbacks, and 1 OnGiveUp at the end.
	if got := invokeCount.Load(); got != 3 {
		t.Errorf("invokeCount = %d, want 3 (first attempt + MaxRetries=2 retries)", got)
	}
	if got := crashCount.Load(); got != 3 {
		t.Errorf("crashCount = %d, want 3 (one per attempt)", got)
	}
	if got := giveUpCount.Load(); got != 1 {
		t.Errorf("giveUpCount = %d, want 1", got)
	}
}
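
// For orientation: the panic-to-error bridge locked by the two
// TestRunWithRecover_* tests plausibly reads like the sketch below.
// Illustrative only -- the tests pin the signature and the nil/non-nil
// contract, not the exact body or error text:
//
//	func runWithRecover(fn func()) (err error) {
//		defer func() {
//			if r := recover(); r != nil {
//				// Reify the crash as an opaque error so CrashRecovery's
//				// generic error-means-retry logic can consume it.
//				err = fmt.Errorf("recovered panic: %v", r)
//			}
//		}()
//		fn()
//		return nil
//	}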