package security

// secret_scanner.go - SecretGuard 接口和默认实现.
//
// 核心设计思路:
//
//	"默认全路径扫描,显式豁免收窄" 而非 "默认关闭,路径白名单打开".
//	原因:遗漏路径比豁免路径危险得多.
//
// 线程安全:DefaultSecretGuard 可安全地被多个 goroutine 并发使用.
// 规则的 sync.Once 编译保证编译只发生一次,后续调用无锁开销.

import (
	"fmt"
	"path/filepath"
	"regexp"
	"strings"
	"sync"
)

// MaxScanBytes is the default upper bound, in bytes, on the content accepted
// by a single scan (512 KB).
//
// CLEVER: an early design had no content-size limit, which was a bug.
// If an agent writes a 100 MB log file, every rule regexp (45 built-in rules
// according to the original design note) makes a full pass over it, which can
// take on the order of seconds: total cost grows with input size times the
// number of rules.
// 512 KB is a reasonable ceiling: it covers the vast majority of config
// files, source files and log excerpts, while larger payloads (data exports,
// base64-encoded binaries, ...) usually do not contain hand-entered keys.
// When the limit is exceeded ErrContentTooLarge is returned and the caller
// decides whether to reject the content or to let it through with a warning.
const MaxScanBytes = 512 * 1024 // 512 KB

// ErrContentTooLarge is returned by Scan when the content exceeds the
// effective size limit.
// The caller may either:
//  1. reject the write (conservative), or
//  2. allow it and record an audit warning (lenient).
//
// NOTE: the message is formatted once with the default MaxScanBytes, so it
// reports that value even when a guard overrides the limit via MaxBytes.
var ErrContentTooLarge = fmt.Errorf("security: content exceeds max scan size (%d bytes)", MaxScanBytes)

// SecretGuard is the core interface for secret scanning.
//
// ELEVATED: the interface is designed to serve three scenarios at once
// (per the original design notes):
//   - CLI: DefaultSecretGuard is injected by default and works out of the box.
//   - SDK (embedded in Go): the host application can pass
//     NewSecretGuardWithRules(customRules) to add domain-specific rules.
//   - HTTP API / SaaS: a different guard can be injected per tenant for isolation.
//
// Rejected alternative: hard-coding the rule scan inside each tool, because
// that can be neither extended nor tested.
//
// Shape: synchronous callback. Engine calls Scan synchronously before
// tool input / output crosses a trust boundary; consumer implementation
// returns detected secrets so the caller can redact or deny.
type SecretGuard interface {
	// Scan inspects content for sensitive information.
	// path is used to evaluate exemption rules (ExemptPaths prefix matching
	// in the default implementation).
	// It returns the list of rules that matched; ErrContentTooLarge signals
	// that the content exceeds the size limit; clean content yields nil, nil.
	Scan(path, content string) ([]SecretMatch, error)

	// Redact replaces secrets found in content with [REDACTED] and returns
	// the redacted string. It is meant for protecting log output and error
	// messages.
	// It returns no error, since redaction itself should not block the flow;
	// content over the size limit is returned as-is (not redacted).
	Redact(content string) string
}

// compiledRule is a rule after compilation, kept so scans do not recompile
// the pattern on every call.
type compiledRule struct {
	id    string         // rule identifier, copied from the source rule's ID
	label string         // human-readable label derived from the ID via ruleIDToLabel
	re    *regexp.Regexp // compiled pattern (rule flags already applied as an inline prefix)
}

// DefaultSecretGuard is the default secret-scanning implementation.
//
// The zero value carries no rules and therefore detects nothing; create
// instances with NewDefaultSecretGuard or NewSecretGuardWithRules.
type DefaultSecretGuard struct {
	// rules holds the uncompiled rule definitions; compile reads them once
	// to build the compiled patterns used by Scan and Redact.
	rules []SecretRule

	// ExemptPaths lists exempt path prefixes (e.g. "testdata/",
	// "/tmp/fixtures/"). When a path matches any prefix, Scan skips the
	// scan and returns nil, nil.
	// CLEVER: prefix matching is used instead of globbing to avoid the
	// complexity of glob parsing; nearly all exemptions are directory
	// prefixes (testdata/, fixtures/, ...), so a prefix is enough.
	ExemptPaths []string

	// MaxBytes overrides the default MaxScanBytes. Zero (or any
	// non-positive value) means "use the default".
	MaxBytes int

	// OnBlocked is an optional callback invoked by Scan when secrets are
	// detected.
	// Per the original design notes it is injected by the engine layer
	// (engine.New() wires it to a "secret_scan_blocked" observer event), so
	// the tool layer (FileWrite/FileEdit) needs no direct Observer dependency.
	//
	// CLEVER: a callback is used instead of depending on an EventObserver
	// interface, so this package does not have to import the engine package
	// and can be used directly by other packages.
	// Rejected alternative: importing engine.EventObserver here, which would
	// create an import cycle (the engine package already imports security).
	OnBlocked func(path string, matches []SecretMatch)

	// Lazily built compilation cache: populated by compile on first use and
	// only read afterwards.
	// CLEVER: sync.Once provides concurrency-safe one-time initialisation
	// without taking a mutex on every call.
	once         sync.Once
	compiled     []compiledRule
	redactRegexp []*regexp.Regexp // one pattern per successfully compiled rule; consumed by Redact
}

// NewDefaultSecretGuard returns a scanner backed by the package's built-in
// rule set (builtinRules).
// No exempt paths are configured, so every path is scanned.
func NewDefaultSecretGuard() *DefaultSecretGuard {
	guard := new(DefaultSecretGuard)
	guard.rules = builtinRules
	return guard
}

// NewSecretGuardWithRules returns a scanner that uses a caller-supplied rule
// set. Typical use: extend the built-in rules with domain-specific ones.
//
//	guard := security.NewSecretGuardWithRules(
//	    append(security.BuiltinRules(), myInternalRules...)
//	)
//
// The rules slice is copied: rule compilation is deferred until the guard is
// first used, so without a copy any later change the caller makes to the
// slice (or to a backing array shared through append) would silently alter
// the guard's rule set.
func NewSecretGuardWithRules(rules []SecretRule) *DefaultSecretGuard {
	owned := append([]SecretRule(nil), rules...)
	return &DefaultSecretGuard{rules: owned}
}

// NoopSecretGuard is the do-nothing implementation: it scans nothing and
// lets everything through.
// It is meant for tests and for setups that deliberately switch scanning off
// (the opt-out has to be spelled out explicitly, it cannot happen by
// forgetting to configure a guard).
//
// CLEVER: an explicit no-op value is safer than nil checks. Callers pass
// security.NoopSecretGuard{} instead of nil and thereby avoid nil
// dereference panics.
type NoopSecretGuard struct{}

// Scan ignores both arguments and always reports no matches and no error.
func (NoopSecretGuard) Scan(_, _ string) ([]SecretMatch, error) {
	return nil, nil
}

// Redact returns content unchanged.
func (NoopSecretGuard) Redact(content string) string {
	return content
}

// ─── DefaultSecretGuard 实现 ──────────────────────────────────────────────────

// maxBytes reports the size limit in effect: the MaxBytes override when it
// is positive, otherwise the package default MaxScanBytes.
func (g *DefaultSecretGuard) maxBytes() int {
	limit := MaxScanBytes
	if override := g.MaxBytes; override > 0 {
		limit = override
	}
	return limit
}

// compile lazily builds the compiled rule caches. It is guarded by
// sync.Once, so the work runs at most once per guard even when called from
// several goroutines.
//
// For every rule the pattern is the rule's Source, prefixed with an inline
// flag group "(?<Flags>)" when Flags is non-empty. Each pattern is compiled
// exactly once and the resulting *regexp.Regexp is shared by Scan (through
// g.compiled) and Redact (through g.redactRegexp): Go's regexp package has no
// "global" flag, match-all versus match-first is chosen by the method that is
// called, and a compiled Regexp is safe for concurrent use, so a second
// compilation of the same pattern would add nothing.
func (g *DefaultSecretGuard) compile() {
	g.once.Do(func() {
		compiled := make([]compiledRule, 0, len(g.rules))
		redact := make([]*regexp.Regexp, 0, len(g.rules))

		for _, r := range g.rules {
			pattern := r.Source
			if r.Flags != "" {
				pattern = "(?" + r.Flags + ")" + pattern
			}

			re, err := regexp.Compile(pattern)
			if err != nil {
				// LEGACY: a rule that fails to compile is skipped silently
				// instead of panicking. Built-in rules are constants and are
				// not expected to fail; for externally supplied rules,
				// skipping was judged safer than crashing.
				// Future improvement: have NewSecretGuardWithRules offer a
				// Validate() that returns an error.
				// NOTE(review): flags that are not valid Go regexp flags
				// (for example a JavaScript-style "g") make the pattern fail
				// to compile, so such a rule is dropped here — verify the
				// rule data.
				continue
			}

			compiled = append(compiled, compiledRule{
				id:    r.ID,
				label: ruleIDToLabel(r.ID),
				re:    re,
			})
			redact = append(redact, re)
		}

		g.compiled = compiled
		g.redactRegexp = redact
	})
}

// isExempt reports whether path falls under one of the ExemptPaths prefixes.
//
// Both the path and every exempt entry are lexically cleaned
// (filepath.Clean) and converted to forward slashes before the prefix
// comparison. Cleaning the path closes a bypass of the exemption list:
// without it "testdata/../secret.env" starts with the prefix "testdata/" and
// would be exempted although it names a file outside that directory.
//
// Empty exempt entries are ignored; every string has the empty prefix, so an
// accidental "" in the list would otherwise switch scanning off for all
// paths. An entry that is only "./" matches nothing after cleaning, because
// cleaned paths never start with "./".
func (g *DefaultSecretGuard) isExempt(path string) bool {
	normalized := filepath.ToSlash(filepath.Clean(path))

	for _, exempt := range g.ExemptPaths {
		if exempt == "" {
			continue
		}

		prefix := filepath.ToSlash(filepath.Clean(exempt))
		// Clean drops a trailing separator. Put it back so that "testdata/"
		// keeps meaning "inside testdata" and does not start matching
		// siblings such as "testdata2/x".
		if strings.HasSuffix(filepath.ToSlash(exempt), "/") && !strings.HasSuffix(prefix, "/") {
			prefix += "/"
		}

		if strings.HasPrefix(normalized, prefix) {
			return true
		}
	}
	return false
}

// Scan looks for sensitive information in content.
//
// path is only used for the exemption check: an exempt path short-circuits
// to nil, nil before anything else is examined. Content longer than the
// effective limit (see maxBytes) yields nil, ErrContentTooLarge. Otherwise
// every compiled rule is tried against content and one SecretMatch is
// returned per distinct rule ID that matched; the result is nil when nothing
// matched.
func (g *DefaultSecretGuard) Scan(path, content string) ([]SecretMatch, error) {
	switch {
	case g.isExempt(path):
		// Exempt paths are let through without scanning.
		return nil, nil
	case len(content) > g.maxBytes():
		// Size guard.
		return nil, ErrContentTooLarge
	}

	g.compile()

	var found []SecretMatch
	reported := make(map[string]struct{}, len(g.compiled))

	for i := range g.compiled {
		cr := &g.compiled[i]

		// CLEVER: each rule ID is reported at most once. When the same kind
		// of key shows up several times (say, multiple GitHub PATs in a test
		// file) the user only needs to learn that this kind is present.
		_, dup := reported[cr.id]
		if dup || !cr.re.MatchString(content) {
			continue
		}

		reported[cr.id] = struct{}{}
		found = append(found, SecretMatch{RuleID: cr.id, Label: cr.label})
	}

	// Fire the callback before handing the matches back, so the audit event
	// is recorded at detection time regardless of what the caller then
	// decides ("audit first, decide second").
	if g.OnBlocked != nil && len(found) > 0 {
		g.OnBlocked(path, found)
	}

	return found, nil
}

// Redact replaces secrets matched in content with [REDACTED].
//
// Content longer than the effective limit (see maxBytes) is returned
// unchanged, i.e. NOT redacted.
// NOTE(review): that makes oversized input fail open; callers that log the
// result should be aware the text may still contain secrets.
func (g *DefaultSecretGuard) Redact(content string) string {
	if limit := g.maxBytes(); len(content) > limit {
		return content
	}

	g.compile()

	// Run the patterns one after another, each on the output of the previous
	// one. The substitution itself is delegated to redactWithGroup.
	redacted := content
	for _, pattern := range g.redactRegexp {
		redacted = redactWithGroup(pattern, redacted)
	}
	return redacted
}

// redactWithGroup replaces every match of re in content with "[REDACTED]".
//
// When the pattern's first capture group took part in a match and is
// non-empty, only the text of that group is replaced, so boundary characters
// that belong to the match but not to the secret (quotes, spaces, "key=",
// semicolons, ...) are kept. When the pattern has no capture group, or the
// group is empty or did not participate, the whole match is replaced.
//
// The group is located by its byte offsets within content, not by searching
// for its text. A textual search would hit the first occurrence of the
// group's value inside the match, which may be the key name instead of the
// secret (e.g. `password=pass` would leave the secret in place), and it would
// require re-running the pattern on the isolated match, where anchors and
// word boundaries can behave differently.
func redactWithGroup(re *regexp.Regexp, content string) string {
	locs := re.FindAllStringSubmatchIndex(content, -1)
	if len(locs) == 0 {
		return content
	}

	var b strings.Builder
	b.Grow(len(content))

	last := 0 // end of the span already copied or replaced
	for _, loc := range locs {
		// Default: replace the whole match.
		start, end := loc[0], loc[1]
		// Narrow to capture group 1 when it matched a non-empty span.
		if len(loc) >= 4 && loc[2] >= 0 && loc[3] > loc[2] {
			start, end = loc[2], loc[3]
		}

		b.WriteString(content[last:start])
		b.WriteString("[REDACTED]")
		last = end
	}
	b.WriteString(content[last:])

	return b.String()
}

// MatchLabels extracts the human-readable Label of every match, in order.
// The result is intended for error messages and always has the same length
// as matches.
func MatchLabels(matches []SecretMatch) []string {
	out := make([]string, 0, len(matches))
	for idx := range matches {
		out = append(out, matches[idx].Label)
	}
	return out
}

// MatchRuleIDs extracts the RuleID of every match, in order.
// The result is intended for audit logs and always has the same length as
// matches.
func MatchRuleIDs(matches []SecretMatch) []string {
	out := make([]string, 0, len(matches))
	for idx := range matches {
		out = append(out, matches[idx].RuleID)
	}
	return out
}