package shadowdb import ( "errors" "regexp" "strings" ) // ErrMissingSessionFilter is returned by EnforceSessionFilter when // the provided SQL does not contain a session_id=? filter (outside // of string literals and comments). Intended to signal a // three-layer-defense middle-tier rejection: the tool description // told the LLM to include the filter; EnforceSessionFilter // confirms it did; the DB column session_id NOT NULL + audit is the // last line. // // ErrMissingSessionFilter 在提供的 SQL 不含 session_id=? filter // (字符串字面与注释之外) 时返回. 表示 "三层防御" 中层拒绝: 工具 // 描述要求 LLM 带 filter; EnforceSessionFilter 复核; DB 的 // session_id NOT NULL + 审计是最后防线. var ErrMissingSessionFilter = errors.New("shadowdb: SQL missing session_id=? filter") // reSessionFilter matches session_id = ? with optional whitespace // and case-insensitive session_id. The \? is escaped. Callers who // use IN / BETWEEN / IS NULL on session_id should construct the // SQL such that it ALSO contains a session_id=? for the enforcement // check to pass, or use a platform-level validator with a looser // rule; shadowdb intentionally keeps the heuristic strict. // // reSessionFilter 匹配 session_id = ? 允许空白, session_id 本身 // 大小写不敏感. \? 已转义. 若用 IN / BETWEEN / IS NULL 作用于 // session_id, 应在 SQL 同时含 session_id=? 让本校验过; 或在平台层 // 自定义更宽松的 validator -- shadowdb 刻意保持启发严格. var reSessionFilter = regexp.MustCompile(`(?i)session_id\s*=\s*\?`) // EnforceSessionFilter returns nil if sql contains a session_id=? // filter outside string literals and comments, ErrMissingSessionFilter // otherwise. Case-insensitive on "session_id"; allows arbitrary // whitespace around the "=". // // Heuristic, not parser: strips single-quote strings, double-quote // strings (SQL standard identifier quoting -- MySQL's ANSI_QUOTES // mode), line comments (-- ...\n), and block comments (/* ... */), // then scans the residue for the regex. Does NOT defend against // adversarial constructions such as dynamic SQL assembly; the // goal is to catch casual LLM forgetfulness, not to prove // correctness. The DB column NOT NULL + audit is the last line. // // Scope limitation: recognises only "session_id = ?". SQL that // uses different forms (IN (?) / IS NULL / a column aliased to // session_id) will be rejected even if semantically correct. // Callers who need those forms bypass EnforceSessionFilter and // build a platform-layer validator. // // EnforceSessionFilter 在 sql 于字符串字面与注释之外包含 // session_id=? filter 时返回 nil, 否则返回 ErrMissingSessionFilter. // "session_id" 大小写不敏感; "=" 两侧允许任意空白. // // 启发式而非 parser: 剥离单引号字符串 / 双引号字符串 (SQL 标准 // 标识符引用 -- MySQL ANSI_QUOTES 模式) / 行注释 (-- ...\n) / // 块注释 (/* ... */), 对残留正则匹配. 不防动态 SQL 拼接等对抗性 // 构造; 目标是捕捉 LLM 随手遗漏, 不证正确性. DB 列 NOT NULL + // 审计是最后防线. // // 作用域限制: 只认 "session_id = ?". SQL 若用其他形态 (IN (?) / // IS NULL / 列别名为 session_id) 即便语义正确也会被拒. 需要这些 // 形态的调用方绕过 EnforceSessionFilter, 在平台层做自己的 validator. func EnforceSessionFilter(sql string) error { stripped := stripStringsAndComments(sql) if !reSessionFilter.MatchString(stripped) { return ErrMissingSessionFilter } return nil } // stripStringsAndComments removes single-quoted / double-quoted // strings and line / block comments from sql, replacing each // stripped region with a single space so token boundaries remain // intact for the downstream regex. Single-char scan, no regex -- // regex-based string stripping is notoriously fragile on backslash // escapes, doubled quotes, and nested comments. // // Quote-escape handling: // - SQL standard: a single quote inside a single-quoted string is // escaped by doubling ('it''s' is the string "it's"). We do // honour this (doubled quote closes then reopens, net effect: // still inside string). // - Backslash escapes (MySQL-ism, '\'' for "'"): NOT honoured. // A backslash in sqlite / PG standard mode is a literal char, // not an escape, so backslash-escape mode is out of scope for // a core helper. Platform-layer validators that target MySQL // with backslash-escape enabled must handle it separately. // // Block-comment handling: SQL does NOT nest block comments (unlike // PG's non-standard extension). We scan for /* ... */ pairs. // // stripStringsAndComments 从 sql 中剔除单引号 / 双引号字符串与 // 行 / 块注释, 每段剥离区替换为一个空格保持词边界不被粘合让 // 下游正则识别. 单字符扫描, 不用正则 -- 正则对反斜杠转义 / // 双引号 / 嵌套注释经常出错. // // 引号转义处理: // - SQL 标准: 单引号串中的单引号以双写转义 ('it''s' 是 "it's" // 串). 本函数遵守 (双引号先闭再开, 净效果: 仍在串内). // - 反斜杠转义 (MySQL 约定, '\'' 表 "'"): 不处理. sqlite / PG // 标准模式下反斜杠是字面字符不转义, 故 backslash-escape 模式 // 超出 core 辅助函数作用域. 目标 MySQL 启用反斜杠转义的平台 // 层 validator 必须自行处理. // // 块注释处理: SQL 的块注释标准不嵌套 (PG 非标准扩展例外), 本函数 // 按 /* ... */ 配对扫描. func stripStringsAndComments(sql string) string { var b strings.Builder b.Grow(len(sql)) i := 0 n := len(sql) for i < n { c := sql[i] switch { case c == '\'': // single-quoted string -- advance to matching ' j := i + 1 for j < n { if sql[j] == '\'' { if j+1 < n && sql[j+1] == '\'' { // SQL-standard doubled quote escape j += 2 continue } break } j++ } b.WriteByte(' ') if j < n { i = j + 1 } else { i = n } case c == '"': // double-quoted string (SQL standard identifier quoting) j := i + 1 for j < n { if sql[j] == '"' { if j+1 < n && sql[j+1] == '"' { j += 2 continue } break } j++ } b.WriteByte(' ') if j < n { i = j + 1 } else { i = n } case c == '-' && i+1 < n && sql[i+1] == '-': // line comment -- to end-of-line j := i + 2 for j < n && sql[j] != '\n' { j++ } b.WriteByte(' ') i = j case c == '/' && i+1 < n && sql[i+1] == '*': // block comment /* ... */ j := i + 2 for j+1 < n { if sql[j] == '*' && sql[j+1] == '/' { break } j++ } b.WriteByte(' ') if j+1 < n { i = j + 2 } else { i = n } default: b.WriteByte(c) i++ } } return b.String() }