// tokenizer_test.go -- Token 估算器的单元测试.
//
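// 覆盖场景:
//   - EstimateTokens 各种文本类型
//   - 英文文本估算
//   - CJK 字符估算
//   - 代码文本估算
//   - EstimateMessageTokens 消息列表估算
//   - EstimateRawMessageTokens json.RawMessage 消息估算
//   - EstimateCost 成本估算
//   - EstimateCostWithPricing 外部定价成本估算
//   - ContextWindow 上下文窗口大小
//   - splitIdentifier 标识符拆分
//   - isCommonWord 常见词判断
//   - estimateWordTokens 单词 token 估算
//   - isCJK CJK 字符判断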
package tokenizer

import (
	"encoding/json"
	"testing"
)

// TestEstimateTokens_English checks that the estimate for a plain English
// sentence falls inside a deliberately loose sanity range.
func TestEstimateTokens_English(t *testing.T) {
	const sentence = "The quick brown fox jumps over the lazy dog"

	// The sentence has nine words; any estimate from 5 to 20 is accepted.
	got := EstimateTokens(sentence)
	if got >= 5 && got <= 20 {
		return
	}
	t.Errorf("英文句子估算 token 不合理: %d (期望 5-20)", got)
}

// TestEstimateTokens_CJK checks that the estimate for a short Chinese string
// falls inside a loose sanity range.
func TestEstimateTokens_CJK(t *testing.T) {
	// The input is eight Chinese characters; any estimate from 8 to 20 is accepted.
	got := EstimateTokens("你好世界这是测试")
	if inRange := got >= 8 && got <= 20; !inRange {
		t.Errorf("中文文本估算 token 不合理: %d (期望 8-20)", got)
	}
}

// TestEstimateTokens_Empty verifies that an empty string is estimated as
// zero tokens.
func TestEstimateTokens_Empty(t *testing.T) {
	// Include the actual value so a failure shows what the estimator returned.
	if got := EstimateTokens(""); got != 0 {
		t.Errorf("空文本应返回 0, 实际: %d", got)
	}
}

// TestEstimateTokens_Code checks that a small Go snippet is estimated as at
// least five tokens.
func TestEstimateTokens_Code(t *testing.T) {
	snippet := `func main() {
    fmt.Println("Hello, World!")
}`
	if got := EstimateTokens(snippet); got < 5 {
		t.Errorf("代码估算 token 太少: %d", got)
	}
}

// TestEstimateTokens_Numbers checks that a run of digits is estimated as at
// least one token.
func TestEstimateTokens_Numbers(t *testing.T) {
	const digits = "1234567890"

	got := EstimateTokens(digits)
	if got >= 1 {
		return
	}
	t.Errorf("数字估算 token 不合理: %d", got)
}

// TestEstimateMessageTokens checks that a two-message conversation is
// estimated as at least ten tokens.
func TestEstimateMessageTokens(t *testing.T) {
	userMsg := Message{Role: "user", Content: "Hello, how are you?"}
	assistantMsg := Message{Role: "assistant", Content: "I am doing well, thank you!"}

	// The lower bound covers the text of both messages plus whatever
	// per-message overhead the estimator adds.
	got := EstimateMessageTokens([]Message{userMsg, assistantMsg})
	if got < 10 {
		t.Errorf("消息 token 估算太少: %d", got)
	}
}

// TestEstimateMessageTokens_Empty verifies that a nil message list is
// estimated as zero tokens.
func TestEstimateMessageTokens_Empty(t *testing.T) {
	// Include the actual value so a failure shows what the estimator returned.
	if got := EstimateMessageTokens(nil); got != 0 {
		t.Errorf("空消息列表应返回 0, 实际: %d", got)
	}
}

// TestEstimateCost checks that EstimateCost reports a zero cost for both a
// named model and an unrecognized one.
//
// NOTE(review): the original comment says the ModelPricing table has been
// emptied, which is why every model is expected to cost 0; that table is not
// visible from this file.
func TestEstimateCost(t *testing.T) {
	const inputTokens, outputTokens = 1000, 500

	if known := EstimateCost(inputTokens, outputTokens, "claude-sonnet-4-6"); known != 0 {
		t.Errorf("ModelPricing 已清空，期望成本为 0, 实际: %f", known)
	}

	// A model name with no pricing entry is expected to cost 0 as well.
	if unknown := EstimateCost(inputTokens, outputTokens, "unknown-model"); unknown != 0 {
		t.Errorf("未知模型期望成本为 0, 实际: %f", unknown)
	}
}

// TestContextWindow checks that ContextWindow returns 200000 for both a
// named model and an unrecognized one.
//
// NOTE(review): the original comment says the ModelPricing table has been
// emptied, so every model falls through to the default; that table is not
// visible from this file.
func TestContextWindow(t *testing.T) {
	// Report the returned value on failure; the earlier messages only
	// restated the expectation.
	if got := ContextWindow("claude-sonnet-4-6"); got != 200000 {
		t.Errorf("期望默认窗口 200000, 实际: %v", got)
	}

	// An unrecognized model name should yield the same default.
	if got := ContextWindow("unknown"); got != 200000 {
		t.Errorf("未知模型默认窗口应为 200000, 实际: %v", got)
	}
}

// TestSplitIdentifier runs splitIdentifier over identifiers in several naming
// styles and checks only the number of parts returned for each.
func TestSplitIdentifier(t *testing.T) {
	type splitCase struct {
		label     string // subtest name
		ident     string // identifier handed to splitIdentifier
		wantParts int    // expected number of parts
	}

	cases := []splitCase{
		{label: "camelCase", ident: "camelCase", wantParts: 2},
		{label: "snake_case", ident: "snake_case", wantParts: 2},
		{label: "kebab-case", ident: "kebab-case", wantParts: 2},
		{label: "单词", ident: "hello", wantParts: 1},
		{label: "HTTPClient", ident: "HTTPClient", wantParts: 2},
		{label: "空字符串", ident: "", wantParts: 0},
	}

	for _, tc := range cases {
		t.Run(tc.label, func(t *testing.T) {
			got := splitIdentifier(tc.ident)
			if n := len(got); n != tc.wantParts {
				t.Errorf("splitIdentifier(%q) 返回 %d 部分, 期望 %d\n  parts: %v", tc.ident, n, tc.wantParts, got)
			}
		})
	}
}

// TestIsCommonWord checks isCommonWord against words expected to be
// recognized as common and words expected to be rejected.
func TestIsCommonWord(t *testing.T) {
	cases := []struct {
		word string
		want bool
	}{
		{"the", true},
		{"is", true},
		{"and", true},
		{"func", true},
		{"return", true},
		{"string", true},
		{"nil", true},
		{"xyzabc", false},
		{"qwertyuiop", false},
		{"foobar", false},
	}

	for _, tc := range cases {
		got := isCommonWord(tc.word)
		switch {
		case tc.want && !got:
			t.Errorf("%q 应为常见词", tc.word)
		case !tc.want && got:
			t.Errorf("%q 不应为常见词", tc.word)
		}
	}
}

// TestEstimateRawMessageTokens checks that two messages whose contents are
// JSON-encoded strings are estimated as at least ten tokens.
func TestEstimateRawMessageTokens(t *testing.T) {
	got := EstimateRawMessageTokens(
		[]string{"user", "assistant"},
		[]json.RawMessage{
			json.RawMessage(`"Hello, how are you?"`),
			json.RawMessage(`"I am fine, thank you!"`),
		},
	)
	if got >= 10 {
		return
	}
	t.Errorf("RawMessage tokens too low: %d", got)
}

// TestEstimateRawMessageTokens_Empty verifies that nil roles and contents are
// estimated as zero tokens.
func TestEstimateRawMessageTokens_Empty(t *testing.T) {
	// Include the actual value so a failure shows what the estimator returned.
	if got := EstimateRawMessageTokens(nil, nil); got != 0 {
		t.Errorf("empty roles should return 0, got %d", got)
	}
}

// TestEstimateRawMessageTokens_NonStringJSON checks that content which is not
// a plain JSON string still produces an estimate of at least four tokens.
func TestEstimateRawMessageTokens_NonStringJSON(t *testing.T) {
	// The content is a JSON array holding one object, not a quoted string.
	structured := json.RawMessage(`[{"type":"text","text":"hello"}]`)

	got := EstimateRawMessageTokens([]string{"assistant"}, []json.RawMessage{structured})
	if got < 4 {
		t.Errorf("non-string JSON tokens too low: %d", got)
	}
}

// TestEstimateCostWithPricing checks the cost computed from caller-supplied
// prices against a hand-computed total, within a tolerance of 0.01.
func TestEstimateCostWithPricing(t *testing.T) {
	// One million input tokens and one hundred thousand output tokens, with
	// prices 3.0 and 15.0: the test expects 3.0 for input plus 1.5 for output.
	want := 3.0 + 1.5
	lower, upper := want-0.01, want+0.01

	got := EstimateCostWithPricing(1_000_000, 100_000, 3.0, 15.0)
	if got < lower || got > upper {
		t.Errorf("cost = %f, want ~%f", got, want)
	}
}

// TestEstimateCostWithPricing_Zero checks that zero input and output tokens
// cost nothing even when the prices are non-zero.
func TestEstimateCostWithPricing_Zero(t *testing.T) {
	got := EstimateCostWithPricing(0, 0, 3.0, 15.0)
	if got == 0 {
		return
	}
	t.Errorf("zero tokens should cost 0, got %f", got)
}

// TestEstimateWordTokens_LongWord checks that a long word is estimated as
// more than a single token.
func TestEstimateWordTokens_LongWord(t *testing.T) {
	// "implementation" has 14 letters; the assertion only requires at least
	// two tokens.
	const word = "implementation"

	if got := estimateWordTokens(word); got < 2 {
		t.Errorf("long word tokens = %d, want >= 2", got)
	}
}

// TestEstimateWordTokens_Empty verifies that an empty word is estimated as
// zero tokens.
func TestEstimateWordTokens_Empty(t *testing.T) {
	// Include the actual value so a failure shows what the estimator returned.
	if got := estimateWordTokens(""); got != 0 {
		t.Errorf("empty word should be 0 tokens, got %d", got)
	}
}

// TestIsCJK_Tokenizer checks isCJK against a Chinese character, a Japanese
// hiragana character, and an ASCII letter.
func TestIsCJK_Tokenizer(t *testing.T) {
	chinese, hiragana, latin := isCJK('你'), isCJK('の'), isCJK('a')

	if !chinese {
		t.Error("中文字符应为 CJK")
	}
	if !hiragana {
		t.Error("日文字符应为 CJK")
	}
	if latin {
		t.Error("英文不应为 CJK")
	}
}