// vision.go implements the flyto.VisionProvider capability for
// MiniMax via its vendor-specific /v1/coding_plan/vlm endpoint.
//
// The endpoint is NOT Anthropic-compatible (rejects multi-turn
// messages, rejects content blocks) and NOT OpenAI-compatible (no
// chat/completions shape) -- it is a single-shot {prompt, image_url}
// in / {content, base_resp} out RPC. Mode (Native/OpenAI/Anthropic)
// has no effect here; vlm shares only the APIKey and baseURL Region
// (China / Global) with the streaming modes.
//
// Authentication uses a Bearer header (NOT x-api-key as Anthropic
// mode uses). The endpoint accepts the same MINIMAX_TOKEN_PLAN_KEY.
//
// Threading: stateless per call; safe to share Provider across
// goroutines (HTTP client is the only mutable state, and http.Client
// is itself goroutine-safe).
//
// vision.go 用 MiniMax 自家 /v1/coding_plan/vlm 端点实现
// flyto.VisionProvider 能力.
//
// 端点既不 Anthropic-compat (拒多轮 messages / 拒 content block) 也
// 不 OpenAI-compat (无 chat/completions 形态) -- 是单次 {prompt,
// image_url} in / {content, base_resp} out RPC. Mode (Native /
// OpenAI / Anthropic) 对此无效; vlm 仅与 streaming mode 共享 APIKey
// 和 baseURL Region (China / Global).
//
// 鉴权用 Bearer header (Anthropic mode 用 x-api-key, 这里**不**用).
// 端点接受同一 MINIMAX_TOKEN_PLAN_KEY.
//
// 并发: 每次调用无状态; Provider 可跨 goroutine 共享 (唯一可变状态是
// http.Client, http.Client 自身 goroutine-safe).

package minimax

import (
	"bytes"
	"context"
	"encoding/base64"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"net/http"
)

// VisionRequest is a single-shot image extraction request to the MiniMax
// vlm endpoint.
//
// Image holds raw bytes, not base64; ExtractVision encodes it into a data
// URI before sending. MaxTokens is reserved for future endpoint support:
// /v1/coding_plan/vlm currently has no max_tokens knob, so the field is
// accepted but ignored.
//
// Why the type lives in the minimax package rather than in flyto: vlm is a
// MiniMax-specific endpoint. Per the MiniMax FAQ, M2 chat completions and the
// Anthropic-compatible /v1/messages reject mixed text-image input, which
// leaves /v1/coding_plan/vlm as the only vision path. Other mainstream LLM
// providers (Claude / GPT-4o / Gemini) carry images inside messages as
// `BlockImage` content blocks through the streaming Provider interface.
// Placing a vision contract on flyto.* would pollute the cross-provider core
// for a single outlier, so by rule-of-two it stays here until a second
// provider needs the same vendor-specific shape.
type VisionRequest struct {
	Prompt    string // instructions for the model
	Image     []byte // raw image bytes
	MediaType string // e.g. "image/png", "image/jpeg"; empty defaults to "image/png"
	MaxTokens int    // reserved; vlm endpoint currently has no token cap
}

// VisionResponse carries the content extracted by the model.
//
// Content is the raw string the model returned. Parsing is left to the
// consumer; it is typically JSON when the prompt asked for structured output.
type VisionResponse struct {
	Content string
}

// VisionClient is the package-local capability interface for MiniMax vision
// extraction. *Provider satisfies it through its ExtractVision method.
//
// Consumers should depend on this interface rather than on *Provider so that
// tests can inject a fake without standing up an httptest server. The
// interface deliberately lives in the minimax package and not in flyto.*,
// because the vlm endpoint is a MiniMax-specific shape.
type VisionClient interface {
	ExtractVision(ctx context.Context, req *VisionRequest) (*VisionResponse, error)
}

// Fixed values used by the vlm RPC.
const (
	// visionEndpointPath is the path appended to the Region-derived baseURL
	// to reach the vlm RPC. The China and Global regions share the same path.
	visionEndpointPath = "/v1/coding_plan/vlm"

	// defaultVisionMediaType is substituted when a caller leaves
	// VisionRequest.MediaType blank. PNG is the safest default for
	// screenshots; callers sending JPEG must set MediaType explicitly.
	defaultVisionMediaType = "image/png"
)

// visionWireRequest mirrors the JSON body sent to the MiniMax vlm endpoint.
//
// It has exactly two fields and no model selector: the endpoint is locked to
// one model version on the MiniMax side, and that model rolls forward without
// API changes.
type visionWireRequest struct {
	Prompt   string `json:"prompt"`
	ImageURL string `json:"image_url"` // data:image/<type>;base64,<bytes> URI
}

// visionWireResponse mirrors the MiniMax vlm response envelope.
//
// Only Content and BaseResp are consumed. Any other fields the endpoint
// returns are silently ignored, which is the default behavior of
// encoding/json when decoding into a struct.
type visionWireResponse struct {
	// Content is the model output; ExtractVision treats an empty value as an
	// error.
	Content  string `json:"content"`
	// BaseResp is the MiniMax status envelope; ExtractVision treats a non-zero
	// StatusCode as an error and reports StatusMsg with it.
	BaseResp struct {
		StatusCode int    `json:"status_code"`
		StatusMsg  string `json:"status_msg"`
	} `json:"base_resp"`
}

// ExtractVision sends one prompt plus image to the MiniMax vlm endpoint and
// returns the text the model produced. It is the method through which
// *Provider satisfies VisionClient.
//
// The call is a single HTTP POST authenticated with a Bearer header, and it
// blocks until the response body has been read. req.Image is base64-encoded
// into a data URI; a blank req.MediaType falls back to
// defaultVisionMediaType. req.MaxTokens is not sent on the wire.
//
// An error is returned when:
//   - req is nil, req.Image is empty, or the configured APIKey is empty;
//   - the request cannot be built or the transport fails;
//   - the HTTP status is not 2xx (a bounded snippet of the body is included);
//   - a 2xx body is larger than 8 MiB;
//   - the body does not decode as the JSON envelope;
//   - base_resp.status_code is non-zero (MiniMax error envelope);
//   - content is empty, because callers have nothing to parse.
func (p *Provider) ExtractVision(ctx context.Context, req *VisionRequest) (*VisionResponse, error) {
	// Upper bound on how much of the response is buffered in memory. The
	// value is deliberately generous for a JSON envelope carrying model text;
	// its purpose is to stop a misbehaving upstream or intermediary from
	// making this call allocate without limit.
	const maxVisionResponseBytes = 8 << 20

	if req == nil {
		return nil, errors.New("minimax: nil VisionRequest")
	}
	if len(req.Image) == 0 {
		return nil, errors.New("minimax: empty image bytes")
	}
	if p.cfg.APIKey == "" {
		return nil, errors.New("minimax: APIKey required for vision")
	}

	mediaType := req.MediaType
	if mediaType == "" {
		mediaType = defaultVisionMediaType
	}
	dataURI := "data:" + mediaType + ";base64," + base64.StdEncoding.EncodeToString(req.Image)

	body, err := json.Marshal(visionWireRequest{
		Prompt:   req.Prompt,
		ImageURL: dataURI,
	})
	if err != nil {
		return nil, fmt.Errorf("minimax: marshal vision request: %w", err)
	}

	url := p.baseURL + visionEndpointPath
	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
	if err != nil {
		return nil, fmt.Errorf("minimax: new vision request: %w", err)
	}
	httpReq.Header.Set("Content-Type", "application/json")
	httpReq.Header.Set("Authorization", "Bearer "+p.cfg.APIKey)

	resp, err := p.visionHTTPClient().Do(httpReq)
	if err != nil {
		return nil, fmt.Errorf("minimax: do vision request: %w", err)
	}
	defer resp.Body.Close()

	// Read one byte past the cap so an oversized body can be told apart from
	// one that is exactly at the limit.
	respBody, err := io.ReadAll(io.LimitReader(resp.Body, maxVisionResponseBytes+1))
	if err != nil {
		return nil, fmt.Errorf("minimax: read vision response: %w", err)
	}
	// The status check comes before the size check: a failed call only needs
	// a short snippet of whatever was read.
	if resp.StatusCode/100 != 2 {
		return nil, fmt.Errorf("minimax: vision status %d: %s", resp.StatusCode, visionSnippet(respBody, 400))
	}
	if len(respBody) > maxVisionResponseBytes {
		return nil, fmt.Errorf("minimax: vision response exceeds %d bytes", maxVisionResponseBytes)
	}

	var vr visionWireResponse
	if err := json.Unmarshal(respBody, &vr); err != nil {
		return nil, fmt.Errorf("minimax: parse vision envelope: %w (body=%s)", err, visionSnippet(respBody, 400))
	}
	// MiniMax can report failures inside a 2xx response through base_resp.
	if vr.BaseResp.StatusCode != 0 {
		return nil, fmt.Errorf("minimax: vision base_resp status=%d msg=%q", vr.BaseResp.StatusCode, vr.BaseResp.StatusMsg)
	}
	if vr.Content == "" {
		return nil, fmt.Errorf("minimax: vision empty content (body=%s)", visionSnippet(respBody, 400))
	}
	return &VisionResponse{Content: vr.Content}, nil
}

// visionHTTPClient returns the http.Client used for vision calls.
//
// An injected cfg.HTTPClient (a test double or a custom transport) is
// returned as-is. Otherwise a new client is built on every call, with
// cfg.Timeout (defaultTimeout when zero) applied as the transport's
// ResponseHeaderTimeout. That limit covers the wait for response headers and
// not the body read that follows, so large images or slow vlm processing can
// still complete.
//
// Keep-alives are disabled on the fallback transport. Every call gets its
// own http.Transport and therefore its own connection pool, so a kept-alive
// connection could never be reused by a later call. With IdleConnTimeout
// left at zero it would stay open until the server closed it. Closing the
// connection after each response avoids accumulating idle connections.
//
// NOTE(review): the previous comment said the Mode-based clients share these
// timeout semantics; those clients are not visible in this file, so confirm
// whether they need the same keep-alive treatment.
func (p *Provider) visionHTTPClient() *http.Client {
	if p.cfg.HTTPClient != nil {
		return p.cfg.HTTPClient
	}
	timeout := p.cfg.Timeout
	if timeout == 0 {
		timeout = defaultTimeout
	}
	return &http.Client{
		Transport: &http.Transport{
			ResponseHeaderTimeout: timeout,
			DisableKeepAlives:     true,
		},
	}
}

// visionSnippet bounds the amount of response body embedded in an error
// message, so that a large upstream payload (for example a 50 KB HTML error
// page) does not flood logs.
//
// It returns b unchanged when it is at most n bytes long. Otherwise it
// returns at most the first n bytes followed by "...". The cut is moved back
// by up to three bytes when it would land inside a multi-byte UTF-8 sequence,
// so the snippet never ends in a partial rune when b is valid UTF-8. A
// negative n is treated as 0.
func visionSnippet(b []byte, n int) string {
	if n < 0 {
		n = 0
	}
	if len(b) <= n {
		return string(b)
	}
	// b[cut] is the first byte that would be dropped. If it is a UTF-8
	// continuation byte (bit pattern 10xxxxxx) the cut falls mid-rune, so
	// step back towards the rune's lead byte. A rune has at most three
	// continuation bytes, which bounds the walk even on malformed input.
	cut := n
	for back := 0; back < 3 && cut > 0 && b[cut]&0xC0 == 0x80; back++ {
		cut--
	}
	return string(b[:cut]) + "..."
}