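// Package tokenizer implements an approximate token estimator.
//
// It is meant to be considerably more accurate than a flat 4 chars/token estimate:
//   - English text is split into words; each word costs roughly 1-2 tokens
//   - Code is split at identifier boundaries (camelCase, snake_case, kebab-case)
//   - CJK characters cost about 2 tokens each (conservative: better too high than too low)
//   - Whitespace is not counted; it is assumed to merge into adjacent tokens
//   - JSON/structural punctuation is counted in runs, about 2 characters per token
//
// No external tokenizer dependency is required; by the original author's
// estimate the result is 2-3x more accurate than a fixed-ratio estimate.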
package tokenizer

import (
	"encoding/json"
	"strings"
	"unicode"
)

// ModelInfo holds the context window size and per-token pricing of a model.
type ModelInfo struct {
	// ContextWindow is the size of the context window, in tokens.
	ContextWindow int
	// InputPricePerMillion is the price in US dollars per million input tokens.
	InputPricePerMillion float64
	// OutputPricePerMillion is the price in US dollars per million output tokens.
	OutputPricePerMillion float64
}

// ModelPricing is the lookup table from model name to pricing and context
// window information.
//
// It is intentionally empty: per the original note, pricing data is managed
// centrally by ModelRegistry in pkg/config/models.go. With no entry for a
// model, EstimateCost returns 0 and ContextWindow returns its default of
// 200000. For exact pricing use EstimateCostWithPricing or
// config.ModelRegistry.EstimateCost.
var ModelPricing = map[string]ModelInfo{}

// Message is the minimal message shape used for token estimation.
// Per the original note, it is kept separate from api.RequestMessage to avoid
// an import cycle.
type Message struct {
	// Role is the message role; EstimateMessageTokens charges a flat overhead
	// per message rather than tokenizing this field.
	Role    string `json:"role"`
	// Content is the message text whose tokens are estimated.
	Content string `json:"content"`
}

// EstimateTokens returns a heuristic, deliberately conservative estimate of
// the number of tokens in text.
//
// The text is scanned once, rune by rune, and divided into runs that are each
// charged by their own rule (checked in this order):
//   - Whitespace is skipped; it is assumed to merge into adjacent tokens.
//   - A run of CJK characters (see isCJK) costs 2 tokens per character.
//   - A digit followed by any mix of digits, '.' and ',' costs ceil(len/3).
//   - A word or identifier is gathered by collectWord, cut at the first CJK
//     rune it contains, and priced by estimateWordTokens.
//   - A run of structural characters (see isStructuralChar) costs ceil(len/2).
//   - Any other rune costs 1 token.
//
// Empty text yields 0. Any non-empty text yields at least 1, even when it
// consists only of whitespace.
func EstimateTokens(text string) int {
	if len(text) == 0 {
		return 0
	}

	runes := []rune(text)
	n := len(runes)
	tokens := 0

	for i := 0; i < n; {
		ch := runes[i]

		switch {
		case unicode.IsSpace(ch):
			// Must be tested before isCJK: U+3000 (ideographic space) is
			// inside the CJK punctuation range but should not be charged.
			i++

		case isCJK(ch):
			// 2 tokens per character rather than 1.5: this estimate feeds
			// budget decisions, where overestimating is harmless but
			// underestimating is not, and rare characters may cost more than
			// common ones.
			start := i
			for i < n && isCJK(runes[i]) {
				i++
			}
			tokens += (i - start) * 2

		case unicode.IsDigit(ch):
			// '.' and ',' are absorbed so that "1,234.56" is one run.
			start := i
			for i < n && (unicode.IsDigit(runes[i]) || runes[i] == '.' || runes[i] == ',') {
				i++
			}
			// The run holds at least one rune, so this is always >= 1.
			tokens += (i - start + 2) / 3

		case unicode.IsLetter(ch):
			start := i
			word := collectWord(runes, &i, n)
			// collectWord accepts every Unicode letter, CJK included. Without
			// this cut, CJK text glued to a Latin word would be priced as one
			// long word and badly underestimated. Rewind to the first CJK
			// rune so the CJK branch prices the rest on the next iteration.
			// runes[start] itself is known not to be CJK.
			for j := start + 1; j < i; j++ {
				if isCJK(runes[j]) {
					word = string(runes[start:j])
					i = j
					break
				}
			}
			tokens += estimateWordTokens(word)

		case isStructuralChar(ch):
			// Structural characters are assumed to merge roughly two per token.
			start := i
			for i < n && isStructuralChar(runes[i]) {
				i++
			}
			// The run holds at least one rune, so this is always >= 1.
			tokens += (i - start + 1) / 2

		default:
			// Any other punctuation or symbol: one token each.
			tokens++
			i++
		}
	}

	// Only reachable for whitespace-only input; text is non-empty here.
	if tokens < 1 {
		tokens = 1
	}

	return tokens
}

// EstimateMessageTokens estimates the token count of a list of messages.
//
// Each message is charged a flat 4 tokens for its role framing plus the
// EstimateTokens estimate of its Content. A further 3 tokens are added once
// for the framing of the sequence as a whole. An empty list costs 0.
func EstimateMessageTokens(messages []Message) int {
	const (
		// Approximate cost of the per-message role tag.
		perMessageOverhead = 4
		// Approximate cost of the framing around the whole sequence.
		sequenceOverhead = 3
	)

	count := len(messages)
	if count == 0 {
		return 0
	}

	sum := sequenceOverhead + perMessageOverhead*count
	for idx := range messages {
		sum += EstimateTokens(messages[idx].Content)
	}
	return sum
}

// EstimateRawMessageTokens estimates the token count of messages whose
// contents are raw JSON values. Per the original note, it exists for
// compatibility with the CompactMessage format in compact.go.
//
// It returns 0 when roles is empty. Otherwise every entry of contents is
// charged 4 tokens of role overhead plus the estimate of its text, and 3
// tokens are added once for the sequence framing. A content value that
// decodes as a JSON string is estimated on the decoded string; anything else
// is estimated on its raw JSON text.
//
// NOTE(review): only the emptiness of roles is consulted; the role values and
// len(roles) are otherwise unused, and the lengths of roles and contents are
// not cross-checked. Confirm that callers always pass parallel slices.
func EstimateRawMessageTokens(roles []string, contents []json.RawMessage) int {
	if len(roles) == 0 {
		return 0
	}

	// Start from the one-off sequence framing overhead.
	sum := 3
	for _, raw := range contents {
		// Per-message role tag overhead.
		sum += 4

		var text string
		if err := json.Unmarshal(raw, &text); err != nil {
			// Not a plain JSON string: estimate on the raw JSON instead.
			text = string(raw)
		}
		sum += EstimateTokens(text)
	}
	return sum
}

// EstimateCost estimates the cost of an API call in US dollars using the
// package-level ModelPricing table.
//
// A model that is not in the table costs 0. For pricing that is not in the
// table, use EstimateCostWithPricing or config.ModelRegistry.EstimateCost.
func EstimateCost(inputTokens, outputTokens int, model string) float64 {
	const perMillion = 1_000_000

	pricing, known := ModelPricing[model]
	if !known {
		return 0
	}

	spentOnInput := float64(inputTokens) * pricing.InputPricePerMillion / perMillion
	spentOnOutput := float64(outputTokens) * pricing.OutputPricePerMillion / perMillion
	return spentOnInput + spentOnOutput
}

// EstimateCostWithPricing estimates the cost of an API call in US dollars
// from caller-supplied prices, each expressed per million tokens.
//
// It lets callers pass in pricing from their own source instead of relying
// on this package's ModelPricing table.
func EstimateCostWithPricing(inputTokens, outputTokens int, inputPricePerMillion, outputPricePerMillion float64) float64 {
	const perMillion = 1_000_000

	// priced converts a token count and a per-million rate into dollars.
	priced := func(count int, rate float64) float64 {
		return float64(count) * rate / perMillion
	}

	return priced(inputTokens, inputPricePerMillion) + priced(outputTokens, outputPricePerMillion)
}

// ContextWindow returns the context window size, in tokens, recorded for
// model in ModelPricing, or 200000 when the model has no entry.
func ContextWindow(model string) int {
	const fallbackWindow = 200000

	info, found := ModelPricing[model]
	if !found {
		return fallbackWindow
	}
	return info.ContextWindow
}

// --- 内部辅助函数 ---

// isCJK reports whether r is treated as a CJK (Chinese/Japanese/Korean)
// character: anything in the Han, Hiragana, Katakana or Hangul scripts, plus
// the CJK punctuation range U+3000-U+303F and the fullwidth/halfwidth forms
// range U+FF00-U+FFEF.
func isCJK(r rune) bool {
	switch {
	case r >= 0x3000 && r <= 0x303F: // CJK symbols and punctuation
		return true
	case r >= 0xFF00 && r <= 0xFFEF: // fullwidth and halfwidth forms
		return true
	}
	return unicode.In(r, unicode.Han, unicode.Hiragana, unicode.Katakana, unicode.Hangul)
}

// isStructuralChar reports whether r is one of the JSON/code structural
// characters: braces, brackets, parentheses, colon, comma, semicolon, and the
// double, single and backtick quote characters.
func isStructuralChar(r rune) bool {
	const structural = "{}[]():,;\"'`"
	return strings.ContainsRune(structural, r)
}

// collectWord reads a maximal run of word characters from runes, starting at
// index *i and never reading at or beyond index n. A word character is any
// Unicode letter or digit, an underscore, or a hyphen.
//
// The run is returned as a string and *i is advanced to the first index after
// it. If no word character is found at *i, the result is "" and *i is left
// unchanged.
func collectWord(runes []rune, i *int, n int) string {
	start := *i
	pos := start
	for pos < n {
		r := runes[pos]
		if r != '_' && r != '-' && !unicode.IsLetter(r) && !unicode.IsDigit(r) {
			break
		}
		pos++
	}

	if pos == start {
		return ""
	}
	*i = pos
	return string(runes[start:pos])
}

// estimateWordTokens 估算一个英文单词/标识符的 token 数.
//
// BPE tokenizer 的典型行为:
//   - 常见英文短词(the, is, and, for 等)= 1 token
//   - 常见英文单词(function, return, import 等)= 1 token
//   - 中等长度词(variable, parameter)= 1-2 token
//   - 长词(implementation)= 2-3 token
//   - 代码标识符中的 camelCase 会在边界处拆分
//   - snake_case 的下划线通常作为分隔符
func estimateWordTokens(word string) int {
	if len(word) == 0 {
		return 0
	}

	// 先按 camelCase 和 snake_case 边界拆分
	parts := splitIdentifier(word)

	total := 0
	for _, part := range parts {
		partLen := len(part)
		switch {
		case partLen <= 0:
			continue
		case partLen <= 4:
			// 短单词或子词:通常 1 token
			total += 1
		case partLen <= 8:
			// 中等长度:1-2 token
			total += 1
			// 如果不是常见词,可能是 2 token
			if !isCommonWord(part) {
				total += 1
			}
		case partLen <= 12:
			// 较长词:2 token
			total += 2
		default:
			// 超长词:按约 5 字符/token 估算
			total += (partLen + 4) / 5
		}
	}

	if total < 1 {
		total = 1
	}

	return total
}

// splitIdentifier breaks an identifier into its component parts at
// camelCase, snake_case and kebab-case boundaries.
//
// Rules, applied rune by rune:
//   - '_' and '-' end the current part and are themselves dropped.
//   - An upper-case rune that follows a non-upper-case rune starts a new part
//     ("fooBar" -> "foo", "Bar").
//   - A lower-case rune that follows two or more upper-case runes starts a new
//     part at the last upper-case rune ("HTTPClient" -> "HTTP", "Client").
//
// Empty parts are never returned. An empty input returns nil.
//
// All splitting is done on rune indices, so multi-byte upper-case letters are
// never cut in the middle of their UTF-8 encoding.
func splitIdentifier(s string) []string {
	if len(s) == 0 {
		return nil
	}

	runes := []rune(s)
	var parts []string
	start := 0 // index of the first rune of the part being built

	// cut closes the part runes[start:end] (if non-empty) and begins the next
	// part at index next.
	cut := func(end, next int) {
		if end > start {
			parts = append(parts, string(runes[start:end]))
		}
		start = next
	}

	for i, ch := range runes {
		switch {
		case ch == '_' || ch == '-':
			// Separator: close the part and skip the separator itself.
			cut(i, i+1)

		case i > 0 && unicode.IsUpper(ch) && !unicode.IsUpper(runes[i-1]):
			// camelCase boundary: non-upper followed by upper.
			cut(i, i)

		case i > 1 && unicode.IsUpper(runes[i-1]) && unicode.IsUpper(runes[i-2]) && unicode.IsLower(ch):
			// End of an acronym: the previous upper-case rune belongs to the
			// word that this lower-case rune continues.
			cut(i-1, i-1)
		}
	}
	cut(len(runes), len(runes))

	return parts
}

// commonWords is the set of lower-case words that isCommonWord recognises:
// frequent English words and programming keywords that the estimator assumes
// usually occupy a single token.
var commonWords = map[string]bool{
	// Common English words.
	"the": true, "is": true, "and": true, "for": true, "with": true,
	"that": true, "this": true, "from": true, "not": true, "but": true,
	"are": true, "was": true, "has": true, "have": true, "had": true,
	"will": true, "can": true, "would": true, "should": true, "could": true,
	"been": true, "being": true, "does": true, "did": true, "done": true,
	"which": true, "when": true, "where": true, "what": true, "how": true,
	"then": true, "than": true, "some": true, "each": true, "only": true,
	"also": true, "into": true, "over": true, "after": true, "before": true,
	"about": true, "between": true, "through": true,

	// Common programming keywords.
	"func": true, "function": true, "return": true, "import": true,
	"export": true, "class": true, "struct": true, "type": true,
	"const": true, "var": true, "let": true, "if": true, "else": true,
	"switch": true, "case": true, "break": true, "continue": true,
	"while": true, "range": true, "defer": true, "error": true,
	"string": true, "int": true, "bool": true, "nil": true, "null": true,
	"true": true, "false": true, "new": true, "make": true, "append": true,
	"package": true, "interface": true, "map": true, "chan": true,
}

// isCommonWord reports whether word, compared case-insensitively, is listed
// in commonWords.
func isCommonWord(word string) bool {
	lowered := strings.ToLower(word)
	return commonWords[lowered]
}