diff --git a/docs/postgres-compatibility.md b/docs/postgres-compatibility.md index 0d668ebb..0086af11 100644 --- a/docs/postgres-compatibility.md +++ b/docs/postgres-compatibility.md @@ -116,7 +116,7 @@ why. | date / time / timestamp / timestamptz / interval | ✅ | `types_test.go::TestTypesDateTime`; server `TestEncode/DecodeDate/Timestamp/Time/Interval` | incl. ancient dates, microseconds | | boolean | ✅ | `types_test.go::TestTypesBoolean`; server `TestEncode/DecodeBool` | all literal forms (t/f/yes/no/1/0) | | uuid | ✅ | `types_test.go::TestTypesUUID`; server `TestEncodeDecodeUUID` | | -| json / jsonb | ✅ | `types_test.go::TestTypesJSON`; server `TestEncodeJSON`/`TestEncodeBinaryJSON` | operators `-> ->> @> ?`; JSONPath `@?` skipped | +| json / jsonb | ✅ | `types_test.go::TestTypesJSON`; server `TestEncodeJSON`/`TestEncodeBinaryJSON`; server `TestJSONExtractDollarKey_RoundTrip`/`_ParameterRoundTrip` | operators `-> ->> @> ?`; `$`-prefixed property keys (`$ai_session_id`, `$group_0`) are rewritten to a quoted-member JSONPath so DuckDB looks them up literally instead of failing as a malformed path (literal keys statically, bound params — including `::text`-cast ones — via the type-aware `duckgres_json_extract_path` macro, which still array-indexes integer params); a `$.`/`$[`-prefixed key navigates as a DuckDB JSONPath even via `->>` (deliberate divergence from PG literal-key semantics, see `normalizeJSONPathKey`); JSONPath `@?` skipped | | arrays | ✅ | `types_test.go::TestTypesArray` | subscript, slice, concat, contains, ANY/ALL | | NULL handling (all types) | ✅ | `types_test.go::TestTypesNullHandling` | | | Casts (`CAST`, `::`, implicit) | ✅ | `types_test.go::TestTypesCasting` | | diff --git a/server/catalog.go b/server/catalog.go index 6b364a8d..e0db81ae 100644 --- a/server/catalog.go +++ b/server/catalog.go @@ -1535,6 +1535,39 @@ func initUtilityMacros(db *sql.DB, serverStartTime, processStartTime time.Time, // In standalone mode this equals control_plane_version(). During rolling updates // these may differ if the control plane has been upgraded but workers haven't yet. fmt.Sprintf(`CREATE OR REPLACE MACRO worker_version() AS '%s'`, strings.ReplaceAll(processVersion, "'", "''")), + + // duckgres_json_extract_path - normalize a JSON-extraction key/path at + // runtime so a Postgres property key like "$ai_session_id" or "$group_0" + // is not mis-parsed by DuckDB as a (malformed) JSONPath and rejected at + // bind time ("JSON path error near ..."). The transpiler wraps the path + // argument of json_extract[_string] in this macro when it is a bound + // parameter ($N) whose value is unknown at transpile time (literal string + // keys are rewritten statically by normalizeJSONPathKey in + // transpiler/transform/operators.go; the string rules below MUST match it). + // + // Because the bound value's TYPE is only known at execute time, the macro + // dispatches on typeof(k): + // - NULL -> NULL + // - integer type -> $[k] (Postgres `json -> int` / `->> int` + // array indexing; a bare integer would also + // index, but every CASE branch must return + // one type, so we emit a VARCHAR path) + // - "$." / "$[" prefix -> unchanged (already a valid JSONPath) + // - other "$"-prefixed key -> $."" (escape \ and " for the + // quoted member) + // - plain key -> unchanged (DuckDB looks it up literally; + // a string "0" stays an object-key lookup, + // matching Postgres text-vs-int semantics) + // The string branches cast k to VARCHAR so LIKE type-checks for every bound + // type (an integer k would otherwise fail to bind the `~~`/LIKE operator). + `CREATE OR REPLACE MACRO duckgres_json_extract_path(k) AS ( + CASE + WHEN k IS NULL THEN NULL + WHEN typeof(k) IN ('TINYINT','SMALLINT','INTEGER','BIGINT','HUGEINT','UTINYINT','USMALLINT','UINTEGER','UBIGINT','UHUGEINT') THEN '$[' || k::VARCHAR || ']' + WHEN k::VARCHAR LIKE '$.%' OR k::VARCHAR LIKE '$[%' THEN k::VARCHAR + WHEN k::VARCHAR LIKE '$%' THEN '$."' || replace(replace(k::VARCHAR, '\', '\\'), '"', '\"') || '"' + ELSE k::VARCHAR + END)`, } for _, m := range macros { diff --git a/server/json_extract_path_test.go b/server/json_extract_path_test.go new file mode 100644 index 00000000..04a961f1 --- /dev/null +++ b/server/json_extract_path_test.go @@ -0,0 +1,175 @@ +package server + +import ( + "database/sql" + "testing" + + "github.com/posthog/duckgres/transpiler" +) + +// TestJSONExtractDollarKey_RoundTrip is the end-to-end regression test for the +// production "Binder Error: JSON path error near 'ai_session_id'" failures: +// PostHog/HogQL property keys that begin with '$' ($ai_session_id, $group_0) +// must extract their value instead of being mis-parsed by DuckDB as a malformed +// JSONPath. These cases transpile through the full pipeline and execute against +// a real in-memory DuckDB seeded with initPgCatalog (which registers the +// duckgres_json_extract_path macro), asserting the actual extracted value. +func TestJSONExtractDollarKey_RoundTrip(t *testing.T) { + runTransformCases(t, []transformCase{ + { + // The exact broken shape: ->> with a '$'-prefixed literal key. + name: "dollar key via ->> extracts the value", + query: `SELECT ('{"$ai_session_id":"sess_abc"}'::json)->>'$ai_session_id'`, + want: "sess_abc", + }, + { + // Direct json_extract_string(...) with a '$'-prefixed literal key — + // the function-call form clients also send. + name: "dollar key via direct json_extract_string extracts the value", + query: `SELECT json_extract_string('{"$group_0":"team_42"}', '$group_0')`, + want: "team_42", + }, + { + // Chained arrows with '$'-prefixed keys at every step. + name: "chained dollar keys extract the nested value", + query: `SELECT ('{"$a":{"$b":"deep"}}'::json)->'$a'->>'$b'`, + want: "deep", + }, + { + // Regression guard: a plain key must still be a literal-key lookup. + name: "plain key is unaffected", + query: `SELECT json_extract_string('{"plain":"ok"}', 'plain')`, + want: "ok", + }, + { + // A '$.'/'$['-prefixed argument is a valid DuckDB JSONPath, so it is + // left to navigate. For a direct json_extract[_string] call this is + // unambiguously correct — the client wrote a DuckDB function and a + // DuckDB path. + name: "valid JSONPath in a direct call navigates", + query: `SELECT json_extract_string('{"a":{"b":"nested"}}', '$.a.b')`, + want: "nested", + }, + { + // KNOWN, DELIBERATE DIVERGENCE: in PostgreSQL `data ->> '$.a.b'` means + // the literal key named "$.a.b" (-> NULL here), but we keep DuckDB + // JSONPath semantics for '$.'/'$['-prefixed arrow keys too — consistent + // with direct calls and with the pre-existing #639 transpiler test. This + // only affects keys that are *also* valid JSONPaths; the reported bug + // keys ($ai_session_id, $group_0 — no dot) are unaffected and HogQL does + // not emit dotted JSONPath property keys. See normalizeJSONPathKey. + name: "dotted JSONPath via arrow navigates (documented divergence)", + query: `SELECT ('{"a":{"b":"nested"}}'::json)->>'$.a.b'`, + want: "nested", + }, + }) +} + +// TestJSONExtractDollarKey_ParameterRoundTrip covers the parameterized form of +// the production bug, where the JSON key arrives as a bound parameter ($1) whose +// value is unknown at transpile time. The transpiler wraps the path argument in +// the duckgres_json_extract_path() macro; here we bind '$ai_session_id' at +// execute time and confirm DuckDB returns the value instead of failing at bind +// time. A normal key bound to the same statement must still work, and an +// ordinary (non-path) string parameter must be untouched. +func TestJSONExtractDollarKey_ParameterRoundTrip(t *testing.T) { + db, err := sql.Open("duckdb", ":memory:") + if err != nil { + t.Fatalf("open duckdb: %v", err) + } + defer func() { _ = db.Close() }() + // initPgCatalog registers initUtilityMacros, including duckgres_json_extract_path. + if err := initPgCatalog(db, processStartTime, processStartTime, "dev", "dev"); err != nil { + t.Fatalf("initPgCatalog: %v", err) + } + // ConvertPlaceholders mirrors the extended-query protocol path that carries + // bound parameters. + tr := transpiler.New(transpiler.Config{ConvertPlaceholders: true}) + + const doc = `{"$ai_session_id":"sess_from_param","normal":"plain_value"}` + const arrDoc = `["zero","one","two"]` + + cases := []struct { + name string + query string + arg any + want string + }{ + { + name: "parameter holding a dollar key extracts the value (the prod bug)", + query: `SELECT json_extract_string('` + doc + `'::json, $1)`, + arg: "$ai_session_id", + want: "sess_from_param", + }, + { + name: "same statement with a normal key still works", + query: `SELECT json_extract_string('` + doc + `'::json, $1)`, + arg: "normal", + want: "plain_value", + }, + { + name: "arrow ->> with a parameter dollar key extracts the value", + query: `SELECT ('` + doc + `'::json)->>$1`, + arg: "$ai_session_id", + want: "sess_from_param", + }, + { + // P2 regression: a path param wrapped in an explicit ::text cast (added + // by some drivers) must still resolve, not bypass normalization. + name: "parameter dollar key with a ::text cast extracts the value", + query: `SELECT json_extract_string('` + doc + `'::json, $1::text)`, + arg: "$ai_session_id", + want: "sess_from_param", + }, + { + // P2 regression: an INTEGER-bound parameter must still index the array + // (Postgres `json ->> int`). The type-aware macro emits a $[i] path + // rather than mangling the integer into a string key. + name: "integer parameter indexes the array (arrow)", + query: `SELECT ('` + arrDoc + `'::json)->>$1`, + arg: 2, + want: "two", + }, + { + name: "integer parameter indexes the array (direct call)", + query: `SELECT json_extract_string('` + arrDoc + `'::json, $1)`, + arg: int64(0), + want: "zero", + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + res, err := tr.Transpile(tc.query) + if err != nil { + t.Fatalf("transpile %q: %v", tc.query, err) + } + if res.ParamCount != 1 { + t.Fatalf("transpile %q: ParamCount = %d, want 1", tc.query, res.ParamCount) + } + var got sql.NullString + if err := db.QueryRow(res.SQL, tc.arg).Scan(&got); err != nil { + t.Fatalf("exec %q (transpiled %q) arg=%v: %v", tc.query, res.SQL, tc.arg, err) + } + if !got.Valid || got.String != tc.want { + t.Fatalf("%q arg=%v = %v, want %q (transpiled %q)", tc.query, tc.arg, got, tc.want, res.SQL) + } + }) + } + + // An ordinary string parameter (not a JSON path argument) must be passed + // through verbatim — the fix must not globally rewrite string parameters. + t.Run("ordinary string parameter is not wrapped", func(t *testing.T) { + res, err := tr.Transpile("SELECT $1::varchar") + if err != nil { + t.Fatalf("transpile: %v", err) + } + var got sql.NullString + if err := db.QueryRow(res.SQL, "$ai_session_id").Scan(&got); err != nil { + t.Fatalf("exec (transpiled %q): %v", res.SQL, err) + } + if !got.Valid || got.String != "$ai_session_id" { + t.Fatalf("ordinary param = %v, want %q (transpiled %q)", got, "$ai_session_id", res.SQL) + } + }) +} diff --git a/tests/e2e-mw-dev/harness.sh b/tests/e2e-mw-dev/harness.sh index d348184f..54f74b13 100755 --- a/tests/e2e-mw-dev/harness.sh +++ b/tests/e2e-mw-dev/harness.sh @@ -299,6 +299,24 @@ pg_compat_functions() { # org password assert_compat "$1" "$2" ducklake "SELECT ('{\"a\":1,\"b\":2}'::jsonb @> '{\"a\":1}'::jsonb)::int::text" "1" "jsonb_contains" # #>> jsonpath text extraction. assert_compat "$1" "$2" ducklake "SELECT '{\"a\":{\"b\":1}}'::json #>> '{a,b}'" "1" "jsonb_path_text" + # '$'-prefixed property keys ($ai_session_id, $group_0): DuckDB reads a + # '$'-prefixed json_extract path as a (malformed) JSONPath and fails at bind + # time ("JSON path error near ..."); the transpiler rewrites such literal keys + # to the quoted-member form $."". Regression for the prod + # json_extract_string failures. Both the ->> arrow and the direct + # json_extract_string(...) call shapes (clients send both) are exercised. + assert_compat "$1" "$2" ducklake "SELECT '{\"\$ai_session_id\":\"sess9\"}'::json ->> '\$ai_session_id'" "sess9" "json_dollar_key_arrow" + assert_compat "$1" "$2" ducklake "SELECT json_extract_string('{\"\$group_0\":\"team7\"}', '\$group_0')" "team7" "json_dollar_key_func" + # duckgres_json_extract_path is the runtime normalization macro the transpiler + # wraps around bound-parameter ($N) JSON paths (whose value is unknown at + # transpile time). Calling it directly asserts it is registered AND + # memory.main-qualified on the live DuckLake worker — the path a parameterized + # `json_extract_string(props, $1)` from a HogQL client takes. + assert_compat "$1" "$2" ducklake "SELECT json_extract_string('{\"\$ai_session_id\":\"viaMacro\"}', duckgres_json_extract_path('\$ai_session_id'))" "viaMacro" "json_dollar_key_macro" + # The macro is type-aware: an integer path argument (Postgres `json ->> int` + # array indexing) becomes a $[i] JSONPath rather than being mangled into a + # string key. Guards the parameterized-array-index path on the live worker. + assert_compat "$1" "$2" ducklake "SELECT json_extract_string('[\"a\",\"b\",\"c\"]', duckgres_json_extract_path(2))" "c" "json_int_index_macro" # PG curly-brace array literal cast. assert_compat "$1" "$2" ducklake "SELECT array_length('{1,2,3}'::int[],1)::text" "3" "array_literal_cast" # set-returning table macro in FROM position (memory.main-qualified). diff --git a/transpiler/transform/operators.go b/transpiler/transform/operators.go index 69de6484..2612fc41 100644 --- a/transpiler/transform/operators.go +++ b/transpiler/transform/operators.go @@ -9,12 +9,25 @@ import ( // OperatorTransform converts PostgreSQL operators to DuckDB equivalents. // Handles JSON operators, regex operators, and other PostgreSQL-specific operators. -type OperatorTransform struct{} +type OperatorTransform struct { + // qualifyMacros mirrors the catalog policy's QualifyMacros (true for + // DuckLake/Iceberg backends). When set, the duckgres_json_extract_path macro + // call we emit for parameterized JSON paths is qualified as + // memory.main.duckgres_json_extract_path so it resolves even though the + // default catalog is the lake catalog. See jsonPathMacroCall. + qualifyMacros bool +} func NewOperatorTransform() *OperatorTransform { return &OperatorTransform{} } +// NewOperatorTransformWithConfig builds an OperatorTransform that qualifies the +// custom macros it emits when qualifyMacros is set (DuckLake/Iceberg backends). +func NewOperatorTransformWithConfig(qualifyMacros bool) *OperatorTransform { + return &OperatorTransform{qualifyMacros: qualifyMacros} +} + func (t *OperatorTransform) Name() string { return "operators" } @@ -470,6 +483,17 @@ func (t *OperatorTransform) transformExpression(node *pg_query.Node) *pg_query.N anyChanged = true } } + // Direct json_extract / json_extract_string calls (clients send these, + // and FunctionTransform rewrites json_extract_path* into them earlier in + // the pipeline): normalize the key/path argument so a '$'-prefixed + // Postgres key like "$ai_session_id" isn't mis-parsed as a JSONPath. The + // arrow operators are handled in createJsonExtractFuncCall instead. + if isJSONExtractFunc(funcCall) && len(funcCall.Args) == 2 { + if newPath := t.normalizeJSONExtractPathArg(funcCall.Args[1]); newPath != nil { + funcCall.Args[1] = newPath + anyChanged = true + } + } if funcCall.AggFilter != nil { if newFilter := t.transformExpression(funcCall.AggFilter); newFilter != nil { funcCall.AggFilter = newFilter @@ -625,6 +649,13 @@ func (t *OperatorTransform) createJsonExtractFuncCall(left, right *pg_query.Node left = newLeft } + // Normalize the key/path so a Postgres property key like "$ai_session_id" + // isn't mis-parsed as a (malformed) JSONPath by DuckDB. See + // normalizeJSONExtractPathArg. + if newRight := t.normalizeJSONExtractPathArg(right); newRight != nil { + right = newRight + } + funcName := "json_extract" if asText { funcName = "json_extract_string" @@ -642,6 +673,150 @@ func (t *OperatorTransform) createJsonExtractFuncCall(left, right *pg_query.Node } } +// jsonExtractPathMacro is the DuckDB scalar macro that normalizes a JSON +// extraction key/path at runtime. It is registered in initUtilityMacros +// (server/catalog.go) so it is available on every backend (standalone, process +// workers, k8s workers). We emit a call to it to wrap *dynamic* path arguments +// (bound parameters) that cannot be rewritten at transpile time. Keep the name +// and the macro body's normalization rules in sync with normalizeJSONPathKey. +const jsonExtractPathMacro = "duckgres_json_extract_path" + +// normalizeJSONPathKey rewrites a *literal* JSON-extraction key so DuckDB looks +// it up as the intended Postgres key instead of mis-parsing it as a JSONPath. +// +// DuckDB's json_extract[_string](doc, path) treats `path` as a JSONPath only +// when it starts with '$'; otherwise the whole string is a single literal-key +// lookup (dots, brackets and quotes included, so those need no special care). A +// '$'-prefixed string is a valid JSONPath only when '$' is followed by '.' or +// '[', so a Postgres property key like "$ai_session_id" or "$group_0" is parsed +// as a malformed path and DuckDB fails at bind time ("JSON path error near +// ..."). We rewrite such keys to an explicit quoted-member path, $."", +// which addresses the literal key. +// +// Returns (newKey, true) when a rewrite happened, (key, false) otherwise. Plain +// keys and already-valid navigating JSONPaths ($.foo, $."foo", $.a[0], $[0]) are +// returned unchanged, so the rewrite is idempotent. +// +// KNOWN DIVERGENCE: a '$.'/'$['-prefixed key is treated as a navigating DuckDB +// JSONPath for BOTH direct json_extract calls and the -> / ->> operators, even +// though in PostgreSQL `data ->> '$.a.b'` means the literal key "$.a.b". We keep +// this consistent (and matching the pre-existing #639 transpiler test) rather +// than giving arrows separate literal-key semantics, because (a) a single +// runtime macro cannot both preserve a valid path and quote it, so arrow params +// could not match arrow literals, and (b) the reported keys ($ai_session_id, +// $group_0 — no dot) and HogQL property keys never take this branch. +func normalizeJSONPathKey(key string) (string, bool) { + if !strings.HasPrefix(key, "$") { + // Plain key: DuckDB does a literal single-key lookup. Already correct. + return key, false + } + if strings.HasPrefix(key, "$.") || strings.HasPrefix(key, "$[") { + // Already a valid navigating JSONPath ($.foo, $."foo", $.a[0], $[0]). + return key, false + } + // '$' followed by neither '.' nor '[' (e.g. "$ai_session_id", "$group_0", or + // a lone "$"): not a valid JSONPath, but a valid Postgres literal key. + // Address it explicitly as a quoted member. Inside a DuckDB JSONPath quoted + // member, '\' and '"' are backslash-escaped (verified against DuckDB 1.5). + escaped := strings.ReplaceAll(key, `\`, `\\`) + escaped = strings.ReplaceAll(escaped, `"`, `\"`) + return `$."` + escaped + `"`, true +} + +// normalizeJSONExtractPathArg normalizes the key/path argument of a JSON +// extraction so a '$'-prefixed Postgres key doesn't trip DuckDB's JSONPath +// parser. It returns a replacement node, or nil if the argument needs no change. +// +// - A string-literal key is rewritten at transpile time (normalizeJSONPathKey). +// - A bound parameter ($N), whose value is only known at execute time, is +// wrapped in the duckgres_json_extract_path() macro so the identical +// normalization runs at runtime, before DuckDB's binder sees the value. The +// parameter node still appears exactly once, so param count/position is +// unchanged. The macro is type-aware, so an integer-bound parameter still +// indexes arrays (Postgres `json -> int`). +// +// An explicit cast is looked through (clients/drivers commonly add ::text to a +// path argument, e.g. json_extract_string(p, $1::text), and a bare ParamRef +// check would miss it). The whole casted expression is wrapped/rewritten so the +// cast — and the parameter's wire type — is preserved. +// +// Any other argument shape (bare integer array index, column reference, +// arbitrary expression) is deliberately left untouched: an integer literal is a +// valid DuckDB array index, and we must not blanket-rewrite non-path arguments. +func (t *OperatorTransform) normalizeJSONExtractPathArg(node *pg_query.Node) *pg_query.Node { + if node == nil { + return nil + } + // Look through any wrapping casts to find the underlying param/literal. + core := node + for { + tc := core.GetTypeCast() + if tc == nil || tc.Arg == nil { + break + } + core = tc.Arg + } + // Bound parameter (bare or cast-wrapped) → wrap the whole expression in the + // runtime macro. + if core.GetParamRef() != nil { + return t.jsonPathMacroCall(node) + } + // Literal string key → rewrite at transpile time. Mutate the inner constant + // in place so an enclosing cast (e.g. '$ai_session_id'::text) is preserved. + if ac := core.GetAConst(); ac != nil { + if sval := ac.GetSval(); sval != nil { + if newKey, changed := normalizeJSONPathKey(sval.Sval); changed { + ac.Val = &pg_query.A_Const_Sval{Sval: &pg_query.String{Sval: newKey}} + return node + } + } + } + return nil +} + +// jsonPathMacroCall builds a call to the duckgres_json_extract_path macro +// wrapping arg. In DuckLake/Iceberg mode (qualifyMacros) the name is emitted as +// memory.main.duckgres_json_extract_path: the macro is created in memory.main +// but the default catalog is the lake catalog, so the call must be explicitly +// qualified. The PgCatalogTransform qualifies other custom macros, but it runs +// earlier in the pipeline and never sees this emitted call, so we qualify here. +func (t *OperatorTransform) jsonPathMacroCall(arg *pg_query.Node) *pg_query.Node { + var funcname []*pg_query.Node + if t.qualifyMacros { + funcname = []*pg_query.Node{ + {Node: &pg_query.Node_String_{String_: &pg_query.String{Sval: "memory"}}}, + {Node: &pg_query.Node_String_{String_: &pg_query.String{Sval: "main"}}}, + {Node: &pg_query.Node_String_{String_: &pg_query.String{Sval: jsonExtractPathMacro}}}, + } + } else { + funcname = []*pg_query.Node{ + {Node: &pg_query.Node_String_{String_: &pg_query.String{Sval: jsonExtractPathMacro}}}, + } + } + return &pg_query.Node{Node: &pg_query.Node_FuncCall{FuncCall: &pg_query.FuncCall{ + Funcname: funcname, + Args: []*pg_query.Node{arg}, + }}} +} + +// isJSONExtractFunc reports whether fc is a json_extract / json_extract_string +// call (the DuckDB forms our pipeline and clients emit). The last funcname +// element is checked so a schema-qualified name still matches. +func isJSONExtractFunc(fc *pg_query.FuncCall) bool { + if fc == nil || len(fc.Funcname) == 0 { + return false + } + last := fc.Funcname[len(fc.Funcname)-1].GetString_() + if last == nil { + return false + } + switch last.Sval { + case "json_extract", "json_extract_string": + return true + } + return false +} + // looksJSON reports whether a node is syntactically JSON: a cast to json/jsonb, // a JSON-returning json* function call (including the json_extract calls this // transform produces from -> / ->>), a JSON-producing conversion function diff --git a/transpiler/transform/pgcatalog.go b/transpiler/transform/pgcatalog.go index 5d3d322b..1ae7f3af 100644 --- a/transpiler/transform/pgcatalog.go +++ b/transpiler/transform/pgcatalog.go @@ -220,6 +220,7 @@ func NewPgCatalogTransformWithConfig(duckLakeMode bool) *PgCatalogTransform { "worker_uptime": true, // Current process uptime as INTERVAL "control_plane_version": true, // Server/control-plane version "worker_version": true, // Worker process version + "duckgres_json_extract_path": true, // Normalize '$'-prefixed JSON keys (emitted by OperatorTransform for param paths) "div": true, // Integer division "array_remove": true, // Remove element from array "array_lower": true, // PostgreSQL array lower-bound compatibility diff --git a/transpiler/transpiler.go b/transpiler/transpiler.go index fe9a0a51..c981b9cc 100644 --- a/transpiler/transpiler.go +++ b/transpiler/transpiler.go @@ -124,7 +124,7 @@ func New(cfg Config) *Transpiler { t.transforms = append(t.transforms, taggedTransform{FlagBooleanPredicates, transform.NewBooleanPredicateTransform()}) // 9. Operator mappings (regex operators, etc.) - t.transforms = append(t.transforms, taggedTransform{FlagOperators, transform.NewOperatorTransform()}) + t.transforms = append(t.transforms, taggedTransform{FlagOperators, transform.NewOperatorTransformWithConfig(catalogPolicy.QualifyMacros)}) // 10. SET/SHOW command handling t.transforms = append(t.transforms, taggedTransform{FlagSetShow, transform.NewSetShowTransform()}) @@ -569,6 +569,13 @@ func Classify(sql string, cfg Config) Classification { if strings.Contains(upper, "COLLATE") { flags |= FlagOperators | FlagPgCatalog } + // Direct json_extract / json_extract_string calls (no arrow operator) also + // need the operator transform, which normalizes '$'-prefixed Postgres keys + // (e.g. "$ai_session_id") that DuckDB would otherwise mis-parse as a + // JSONPath and reject at bind time. The arrow check above misses these. + if strings.Contains(upper, "JSON_EXTRACT") { + flags |= FlagOperators + } // FOR UPDATE/SHARE/NO KEY UPDATE/KEY SHARE locking clauses (upperNorm so // newline/tab-separated spellings like "FOR\nUPDATE" are caught too) diff --git a/transpiler/transpiler_test.go b/transpiler/transpiler_test.go index b2c652b2..e0b867b0 100644 --- a/transpiler/transpiler_test.go +++ b/transpiler/transpiler_test.go @@ -2469,6 +2469,225 @@ func TestTranspile_JSONOperators_CreateTableAs(t *testing.T) { } } +// TestTranspile_JSONExtract_DollarPrefixedKeys is the regression test for the +// production "Binder Error: JSON path error near 'ai_session_id'" failures. +// +// PostHog/HogQL property keys frequently start with '$' ($ai_session_id, +// $group_0, ...). Postgres `properties ->> '$ai_session_id'` means "extract the +// value at the literal key named $ai_session_id". But DuckDB's +// json_extract[_string] treats a '$'-prefixed second argument as a JSONPath, +// and "$ai_session_id" is not a valid JSONPath ('$' must be followed by '.' or +// '['), so DuckDB fails at bind time. The fix rewrites such literal keys to an +// explicit quoted-member path: $."$ai_session_id". +// +// Keys that are already valid navigating JSONPaths ($.foo, $."foo", $.a[0], +// $[0]) and plain keys (key) — which DuckDB looks up literally — must be left +// untouched. +func TestTranspile_JSONExtract_DollarPrefixedKeys(t *testing.T) { + tr := New(DefaultConfig()) + + tests := []struct { + name string + input string + expected string + }{ + // --- The broken production cases --- + { + name: "dollar-prefixed key via ->> is wrapped as quoted member", + input: "SELECT properties->>'$ai_session_id' FROM events", + expected: `SELECT json_extract_string(properties, '$."$ai_session_id"') FROM events`, + }, + { + name: "group_0 key via ->> is wrapped", + input: "SELECT properties->>'$group_0' FROM events", + expected: `SELECT json_extract_string(properties, '$."$group_0"') FROM events`, + }, + { + name: "dollar-prefixed key via -> (json) is wrapped", + input: "SELECT data->'$ai_session_id' FROM t", + expected: `SELECT json_extract(data, '$."$ai_session_id"') FROM t`, + }, + { + name: "lone dollar key is wrapped", + input: "SELECT data->>'$' FROM t", + expected: `SELECT json_extract_string(data, '$."$"') FROM t`, + }, + // --- Must be preserved: normal keys --- + { + name: "plain key is left as a literal key", + input: "SELECT data->>'key' FROM t", + expected: "SELECT json_extract_string(data, 'key') FROM t", + }, + { + name: "plain key with a dot is left literal (DuckDB looks it up literally)", + input: "SELECT data->>'foo.bar' FROM t", + expected: "SELECT json_extract_string(data, 'foo.bar') FROM t", + }, + // --- Must be preserved: already-valid JSONPaths --- + { + name: "valid dotted JSONPath is preserved", + input: "SELECT data->>'$.foo' FROM t", + expected: "SELECT json_extract_string(data, '$.foo') FROM t", + }, + { + name: "valid quoted-member JSONPath is preserved (idempotent)", + input: `SELECT data->>'$."foo"' FROM t`, + expected: `SELECT json_extract_string(data, '$."foo"') FROM t`, + }, + { + name: "valid array-index JSONPath is preserved", + input: "SELECT data->>'$.a[0]' FROM t", + expected: "SELECT json_extract_string(data, '$.a[0]') FROM t", + }, + { + name: "valid root-bracket JSONPath is preserved", + input: "SELECT data->'$[0]' FROM t", + expected: "SELECT json_extract(data, '$[0]') FROM t", + }, + // --- Direct json_extract_string(...) func call (the exact production shape) --- + { + name: "direct json_extract_string with dollar literal key is wrapped", + input: "SELECT json_extract_string(events.properties, '$ai_session_id') AS ai_session_id FROM events", + expected: `SELECT json_extract_string(events.properties, '$."$ai_session_id"') AS ai_session_id FROM events`, + }, + { + name: "direct json_extract with valid path is preserved", + input: "SELECT json_extract(data, '$.a') FROM t", + expected: "SELECT json_extract(data, '$.a') FROM t", + }, + { + // A literal key wrapped in an explicit ::text cast: the cast must be + // looked through and the inner literal rewritten, preserving the cast. + name: "dollar literal key wrapped in a text cast is rewritten", + input: "SELECT json_extract_string(props, '$ai_session_id'::text) FROM events", + expected: `SELECT json_extract_string(props, '$."$ai_session_id"'::text) FROM events`, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result, err := tr.Transpile(tt.input) + if err != nil { + t.Fatalf("Transpile(%q) error: %v", tt.input, err) + } + if result.SQL != tt.expected { + t.Errorf("Transpile(%q)\n got: %q\n want: %q", tt.input, result.SQL, tt.expected) + } + }) + } +} + +// TestTranspile_JSONExtract_ParameterPath covers the parameterized form of the +// production bug: the JSON key arrives as a bound parameter ($1) whose value is +// unknown at transpile time (e.g. `properties ->> $1` with $1 = '$ai_session_id' +// at execute). We cannot rewrite the literal, so the path argument is wrapped in +// the duckgres_json_extract_path() macro, which normalizes the key to a valid +// DuckDB JSONPath at runtime — before DuckDB's binder sees it. +// +// Only JSON-extraction path arguments are wrapped; ordinary string parameters +// must be left completely alone. +func TestTranspile_JSONExtract_ParameterPath(t *testing.T) { + tr := New(Config{ConvertPlaceholders: true}) + + tests := []struct { + name string + input string + expected string + }{ + { + name: "arrow ->> with parameter path is wrapped in the macro", + input: "SELECT properties->>$1 FROM events", + expected: "SELECT json_extract_string(properties, duckgres_json_extract_path($1)) FROM events", + }, + { + name: "arrow -> (json) with parameter path is wrapped in the macro", + input: "SELECT data->$1 FROM t", + expected: "SELECT json_extract(data, duckgres_json_extract_path($1)) FROM t", + }, + { + name: "direct json_extract_string with parameter path is wrapped (production shape)", + input: "SELECT json_extract_string(events.properties, $1) AS ai_session_id FROM events", + expected: "SELECT json_extract_string(events.properties, duckgres_json_extract_path($1)) AS ai_session_id FROM events", + }, + { + // A param wrapped in an explicit ::text cast (clients/drivers add + // these) must still be wrapped — the cast is preserved inside the macro + // call so the param's wire type is unchanged. + name: "direct func param with ::text cast is wrapped", + input: "SELECT json_extract_string(props, $1::text) FROM events", + expected: "SELECT json_extract_string(props, duckgres_json_extract_path($1::text)) FROM events", + }, + { + name: "arrow ->> param with ::text cast is wrapped", + input: "SELECT props->>$1::text FROM events", + expected: "SELECT json_extract_string(props, duckgres_json_extract_path($1::text)) FROM events", + }, + { + name: "ordinary string parameter is NOT wrapped", + input: "SELECT * FROM events WHERE name = $1", + expected: "SELECT * FROM events WHERE name = $1", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result, err := tr.Transpile(tt.input) + if err != nil { + t.Fatalf("Transpile(%q) error: %v", tt.input, err) + } + if result.SQL != tt.expected { + t.Errorf("Transpile(%q)\n got: %q\n want: %q", tt.input, result.SQL, tt.expected) + } + if result.ParamCount != 1 { + t.Errorf("Transpile(%q) ParamCount = %d, want 1 (wrapping must not change param count)", tt.input, result.ParamCount) + } + }) + } +} + +// TestTranspile_JSONExtract_ParameterPath_DuckLake verifies that in DuckLake +// mode (the worker/remote backend) the duckgres_json_extract_path macro call is +// emitted memory.main-qualified. The macro is created in memory.main but the +// default catalog is the lake catalog, so an unqualified call would fail to +// resolve. The PgCatalogTransform that qualifies other custom macros runs before +// the operator transform and never sees this emitted call. +func TestTranspile_JSONExtract_ParameterPath_DuckLake(t *testing.T) { + tr := New(Config{ConvertPlaceholders: true, DuckLakeMode: true}) + + tests := []struct { + name string + input string + expected string + }{ + { + name: "arrow ->> param path is wrapped in the qualified macro", + input: "SELECT properties->>$1 FROM events", + expected: "SELECT json_extract_string(properties, memory.main.duckgres_json_extract_path($1)) FROM events", + }, + { + name: "direct json_extract_string param path is wrapped in the qualified macro", + input: "SELECT json_extract_string(properties, $1) FROM events", + expected: "SELECT json_extract_string(properties, memory.main.duckgres_json_extract_path($1)) FROM events", + }, + { + name: "param path with a ::text cast is wrapped in the qualified macro", + input: "SELECT json_extract_string(properties, $1::text) FROM events", + expected: "SELECT json_extract_string(properties, memory.main.duckgres_json_extract_path($1::text)) FROM events", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result, err := tr.Transpile(tt.input) + if err != nil { + t.Fatalf("Transpile(%q) error: %v", tt.input, err) + } + if result.SQL != tt.expected { + t.Errorf("Transpile(%q)\n got: %q\n want: %q", tt.input, result.SQL, tt.expected) + } + }) + } +} + func TestTranspile_ExpandArray(t *testing.T) { tests := []struct { name string