diff --git a/CHANGELOG.md b/CHANGELOG.md index 626fac385..c92b43447 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ Full release notes with details on each version: [GitHub Releases](https://githu ## Unreleased +- Feat: per-project `query` defaults for `--budget`/`--depth`, read from an optional `graphify-out/config.json` (#1654, thanks @Ns2384-star). Declare `{"query": {"default_budget": N, "default_depth": N}}` (flat `budget`/`depth` keys also accepted) to seed the query CLI's defaults before flag parsing, so a per-repo budget/depth becomes the norm without retyping it — and an explicit `--budget`/`--depth` flag still overrides. The `query` command also gains a `--depth` flag (it had none, forcing the old hardcoded depth 2), making the traversal depth tunable per invocation. A missing, unreadable, malformed, or ill-typed config silently degrades to the built-in defaults (budget 2000, depth 2) so a bad file never crashes a query. - Fix: a malformed semantic chunk no longer crashes `extract` and discards every successful chunk (#1631, thanks @ssazy). When an LLM returned a well-formed object whose `edges` (or `nodes`/`hyperedges`) array carried a stray non-dict entry — a nested list where an edge object belongs — the AST+semantic merge and the semantic-cache write both called `.get()` per entry and raised `AttributeError: 'list' object has no attribute 'get'`. On a 34-chunk run where 33 succeeded, that meant no `graph.json` was written and the cache write failed too, so a re-run re-extracted everything. `_parse_llm_json` now sanitizes each fragment at the single parse chokepoint (keeping only dict entries and coercing a non-list value to `[]`), so the cache writer, the adaptive-retry merge, and the CLI merge are all protected in one place. - Fix: an unresolved bare npm import no longer aliases onto an unrelated same-named local file (#1638, thanks @EveX1). 
`import colors from "tailwindcss/colors"` in a `.tsx` file emitted an `imports_from` edge to the bare id `colors`, and build.py's pre-migration alias index (which registers every local file's bare stem) then remapped it onto an unrelated `backend/utils/colors.py` — a confident (`EXTRACTED`) cross-language phantom edge, and one per `.tsx` file sharing the import. In a real monorepo eight unrelated `.tsx` files all landed on a single Python module. Common package subpaths (`colors`, `utils`, `types`, `config`, `client`) collide this way constantly. The external-import fallback now namespaces its target with the `ref` prefix (the same J-4 convention used for tsconfig `extends`/`$ref` externals), so it can never collapse to a local file/symbol id; the ref-namespaced target has no node, so build drops it as an external reference — the correct outcome for a third-party import. - Fix: `graph.json` node/edge ordering is now stable run-to-run for document/semantic corpora (#1632, thanks @umeshpsatwe). With a parallel LLM backend, `extract_corpus_parallel` merged chunk results in completion order, so which network call happened to return first reordered the nodes and edges even when the model returned identical content — churning `graph.json` between otherwise-identical runs. Chunks are now merged in deterministic submission order after the pool drains (matching the serial path); the progress callback still fires in completion order so long local runs aren't silent. Note: the semantic content the LLM extracts is itself nondeterministic run-to-run — this fix removes the pipeline's own ordering churn, not the model's variance. diff --git a/graphify/__main__.py b/graphify/__main__.py index 59dcd70a5..5e12119a2 100644 --- a/graphify/__main__.py +++ b/graphify/__main__.py @@ -22,6 +22,7 @@ # Defined once in graphify.paths so the security/callflow path guards honour the # same override (#1423). 
from graphify.paths import GRAPHIFY_OUT as _GRAPHIFY_OUT +from graphify.paths import query_config_defaults as _query_config_defaults @functools.lru_cache(maxsize=None) @@ -2305,7 +2306,10 @@ def main() -> None: print(" --dfs use depth-first instead of breadth-first") print(" --context C explicit edge-context filter (repeatable)") print(" --budget N cap output at N tokens (default 2000)") + print(" --depth N traversal depth (default 2)") print(" --graph path to graph.json (default graphify-out/graph.json)") + print(" (defaults for --budget/--depth can be set per-project in") + print(" graphify-out/config.json: {\"query\": {\"default_budget\": N, \"default_depth\": N}}; CLI flags override)") print(" affected \"X\" reverse traversal to find nodes impacted by X") print(" --relation R edge relation to traverse in reverse (repeatable)") print(" --depth N reverse traversal depth (default 2)") @@ -2842,7 +2846,7 @@ def main() -> None: sys.exit(1) elif cmd == "query": if len(sys.argv) < 3: - print("Usage: graphify query \"\" [--dfs] [--context C] [--budget N] [--graph path]", file=sys.stderr) + print("Usage: graphify query \"\" [--dfs] [--context C] [--budget N] [--depth N] [--graph path]", file=sys.stderr) sys.exit(1) from graphify.serve import _query_graph_text from graphify.security import sanitize_label @@ -2851,7 +2855,15 @@ def main() -> None: question = sys.argv[2] use_dfs = "--dfs" in sys.argv + # Built-in defaults, optionally seeded from graphify-out/config.json; + # CLI flags below still override the config (#1654). 
budget = 2000 + depth = 2 + _cfg_defaults = _query_config_defaults() + if "budget" in _cfg_defaults: + budget = _cfg_defaults["budget"] + if "depth" in _cfg_defaults: + depth = _cfg_defaults["depth"] graph_path = _default_graph_path() context_filters: list[str] = [] args = sys.argv[3:] @@ -2871,6 +2883,26 @@ def main() -> None: print(f"error: --budget must be an integer", file=sys.stderr) sys.exit(1) i += 1 + elif args[i] == "--depth" and i + 1 < len(args): + try: + depth = int(args[i + 1]) + except ValueError: + print(f"error: --depth must be an integer", file=sys.stderr) + sys.exit(1) + if depth <= 0: + print("error: --depth must be a positive integer", file=sys.stderr) + sys.exit(1) + i += 2 + elif args[i].startswith("--depth="): + try: + depth = int(args[i].split("=", 1)[1]) + except ValueError: + print(f"error: --depth must be an integer", file=sys.stderr) + sys.exit(1) + if depth <= 0: + print("error: --depth must be a positive integer", file=sys.stderr) + sys.exit(1) + i += 1 elif args[i] == "--context" and i + 1 < len(args): context_filters.append(args[i + 1]) i += 2 @@ -2922,7 +2954,7 @@ def main() -> None: G, question, mode=_mode, - depth=2, + depth=depth, token_budget=budget, context_filters=context_filters, ) @@ -2932,7 +2964,7 @@ def main() -> None: corpus=str(gp), result=_result, mode=_mode, - depth=2, + depth=depth, token_budget=budget, duration_ms=(_time.perf_counter() - _t0) * 1000, ) diff --git a/graphify/paths.py b/graphify/paths.py index d2bfdd9f5..dd61b82ab 100644 --- a/graphify/paths.py +++ b/graphify/paths.py @@ -16,6 +16,7 @@ from __future__ import annotations +import json import os import re from pathlib import Path, PurePosixPath @@ -232,3 +233,61 @@ def default_graph_json() -> str: the path is passed explicitly (#1423). """ return str(out_path("graph.json")) + + +def query_config_defaults(config_path: Path | None = None) -> dict[str, int]: + """Per-project ``query`` defaults read from ``graphify-out/config.json``. 
+ + Returns any ``budget``/``depth`` overrides the sidecar declares, as a dict + that may contain either, both, or neither key. The values seed the CLI's + built-in defaults before flag parsing, so a CLI flag still wins (#1654). + + The file may nest the settings under a ``"query"`` object (the documented + shape) or place them at the top level, and either the + ``default_budget``/``default_depth`` or bare ``budget``/``depth`` spelling + is accepted:: + + {"query": {"default_budget": 4000, "default_depth": 3}} + + A missing file, unreadable file, malformed JSON, or wrong top-level type + degrades to an empty dict; an invalid value is skipped on its own (other + valid keys still apply), so a bad config never crashes a query. A + whole-valued float (``4000.0``) is coerced to ``int``; a non-positive + number, fractional float (``4000.5``), bool, string, or null is rejected. + When both a nested and a flat value are valid the nested one wins. + """ + defaults: dict[str, int] = {} + target = config_path if config_path is not None else out_path("config.json") + try: + raw = json.loads(Path(target).read_text(encoding="utf-8")) + except (OSError, ValueError): + return defaults + if not isinstance(raw, dict): + return defaults + section = raw.get("query") + if not isinstance(section, dict): + section = {} + + def _pick(*keys: str) -> int | None: + for source in (section, raw): + for key in keys: + value = source.get(key) + # bool is an int subclass; reject it up front so True/False can + # never read as 1/0. + if isinstance(value, bool): + continue + if isinstance(value, int) and value > 0: + return value + # Accept a whole-valued float (4000.0 -> 4000) from a hand-written + # config; reject a fractional one (4000.5) and non-positive values. 
+ if isinstance(value, float) and value.is_integer() and value > 0: + return int(value) + return None + + budget = _pick("default_budget", "budget") + if budget is not None: + defaults["budget"] = budget + depth = _pick("default_depth", "depth") + if depth is not None: + defaults["depth"] = depth + return defaults diff --git a/tests/test_paths.py b/tests/test_paths.py index e0e1a2f00..efdf33b64 100644 --- a/tests/test_paths.py +++ b/tests/test_paths.py @@ -2,11 +2,14 @@ from __future__ import annotations +import json + import pytest from graphify.paths import ( _is_test_path, disambiguate_ambiguous_candidates, + query_config_defaults, ) @@ -97,3 +100,80 @@ def test_disambiguate_path_proximity_same_dir() -> None: "pkg/a/caller.py", ) assert winner == "near" + + +# --- query_config_defaults (per-project config.json, #1654) ----------------- + + +def _write_config(tmp_path, data) -> None: + (tmp_path / "config.json").write_text(json.dumps(data), encoding="utf-8") + + +def test_query_config_defaults_nested_query_object(tmp_path) -> None: + _write_config(tmp_path, {"query": {"default_budget": 4000, "default_depth": 3}}) + assert query_config_defaults(tmp_path / "config.json") == {"budget": 4000, "depth": 3} + + +def test_query_config_defaults_flat_keys(tmp_path) -> None: + _write_config(tmp_path, {"budget": 1234, "depth": 5}) + assert query_config_defaults(tmp_path / "config.json") == {"budget": 1234, "depth": 5} + + +def test_query_config_defaults_partial(tmp_path) -> None: + _write_config(tmp_path, {"query": {"default_depth": 4}}) + assert query_config_defaults(tmp_path / "config.json") == {"depth": 4} + + +def test_query_config_defaults_nested_wins_over_flat(tmp_path) -> None: + _write_config(tmp_path, {"query": {"default_budget": 4000}, "budget": 9999}) + assert query_config_defaults(tmp_path / "config.json") == {"budget": 4000} + + +def test_query_config_defaults_missing_file(tmp_path) -> None: + assert query_config_defaults(tmp_path / "does-not-exist.json") == {} 
+ + +def test_query_config_defaults_malformed_json(tmp_path) -> None: + (tmp_path / "config.json").write_text("{not valid json", encoding="utf-8") + assert query_config_defaults(tmp_path / "config.json") == {} + + +def test_query_config_defaults_rejects_bad_values(tmp_path) -> None: + # non-int, bool, zero, and negative values are all ignored. + _write_config( + tmp_path, + {"query": {"default_budget": "lots", "default_depth": -1}, "budget": True, "depth": 0}, + ) + assert query_config_defaults(tmp_path / "config.json") == {} + + +def test_query_config_defaults_non_dict_top_level(tmp_path) -> None: + (tmp_path / "config.json").write_text(json.dumps([1, 2, 3]), encoding="utf-8") + assert query_config_defaults(tmp_path / "config.json") == {} + + +def test_query_config_defaults_whole_valued_float_accepted(tmp_path) -> None: + # A hand-written config often carries floats; a whole-valued one coerces. + _write_config(tmp_path, {"query": {"default_budget": 4000.0, "default_depth": 3.0}}) + assert query_config_defaults(tmp_path / "config.json") == {"budget": 4000, "depth": 3} + + +def test_query_config_defaults_fractional_float_rejected(tmp_path) -> None: + # A fractional float can't be an integer depth/budget, so it degrades. + _write_config(tmp_path, {"query": {"default_budget": 4000.5, "default_depth": 2.5}}) + assert query_config_defaults(tmp_path / "config.json") == {} + + +def test_query_config_defaults_absolute_graphify_out(tmp_path, monkeypatch) -> None: + # With no explicit path, the reader resolves via out_path(), which honours + # an absolute GRAPHIFY_OUT override (#1423 / #686). 
+ import graphify.paths as paths + + out_dir = tmp_path / "shared" / "graphify-out" + out_dir.mkdir(parents=True) + (out_dir / "config.json").write_text( + json.dumps({"query": {"default_budget": 7000, "default_depth": 4}}), + encoding="utf-8", + ) + monkeypatch.setattr(paths, "GRAPHIFY_OUT", str(out_dir)) + assert query_config_defaults() == {"budget": 7000, "depth": 4} diff --git a/tests/test_query_cli.py b/tests/test_query_cli.py index cf8eb6e56..a5c39bd32 100644 --- a/tests/test_query_cli.py +++ b/tests/test_query_cli.py @@ -51,6 +51,166 @@ def test_query_cli_heuristic_context_filter(monkeypatch, tmp_path, capsys): assert "build" not in out +def _capture_query_args(monkeypatch): + """Capture the depth/token_budget the query handler resolves. + + Stubs both consumers (``serve._query_graph_text`` and + ``querylog.log_query``) since the handler imports them locally, and returns + a dict the assertions read after ``main()`` runs. + """ + import graphify.serve as servemod + import graphify.querylog as querylogmod + + captured: dict[str, int] = {} + + def _fake_query(G, question, *, mode, depth, token_budget, context_filters): + captured["depth"] = depth + captured["token_budget"] = token_budget + return "stub-result" + + def _fake_log(**kwargs): + captured["log_depth"] = kwargs.get("depth") + captured["log_budget"] = kwargs.get("token_budget") + + monkeypatch.setattr(servemod, "_query_graph_text", _fake_query) + monkeypatch.setattr(querylogmod, "log_query", _fake_log) + monkeypatch.setattr(mainmod, "_check_skill_version", lambda _: None) + return captured + + +def test_query_cli_config_sets_budget_and_depth(monkeypatch, tmp_path, capsys): + graph_path = _write_graph(tmp_path) + (tmp_path / "graphify-out").mkdir() + (tmp_path / "graphify-out" / "config.json").write_text( + json.dumps({"query": {"default_budget": 4000, "default_depth": 3}}), + encoding="utf-8", + ) + monkeypatch.chdir(tmp_path) + captured = _capture_query_args(monkeypatch) + monkeypatch.setattr( + 
mainmod.sys, "argv", ["graphify", "query", "extract", "--graph", str(graph_path)] + ) + mainmod.main() + assert captured["depth"] == 3 + assert captured["token_budget"] == 4000 + assert captured["log_depth"] == 3 + assert captured["log_budget"] == 4000 + + +def test_query_cli_flags_override_config(monkeypatch, tmp_path, capsys): + graph_path = _write_graph(tmp_path) + (tmp_path / "graphify-out").mkdir() + (tmp_path / "graphify-out" / "config.json").write_text( + json.dumps({"query": {"default_budget": 4000, "default_depth": 3}}), + encoding="utf-8", + ) + monkeypatch.chdir(tmp_path) + captured = _capture_query_args(monkeypatch) + monkeypatch.setattr( + mainmod.sys, + "argv", + ["graphify", "query", "extract", "--budget", "1000", "--depth", "5", "--graph", str(graph_path)], + ) + mainmod.main() + assert captured["depth"] == 5 + assert captured["token_budget"] == 1000 + + +def test_query_cli_no_config_uses_builtin_defaults(monkeypatch, tmp_path, capsys): + graph_path = _write_graph(tmp_path) + monkeypatch.chdir(tmp_path) # no graphify-out/config.json here + captured = _capture_query_args(monkeypatch) + monkeypatch.setattr( + mainmod.sys, "argv", ["graphify", "query", "extract", "--graph", str(graph_path)] + ) + mainmod.main() + assert captured["depth"] == 2 + assert captured["token_budget"] == 2000 + + +def test_query_cli_malformed_config_falls_back(monkeypatch, tmp_path, capsys): + graph_path = _write_graph(tmp_path) + (tmp_path / "graphify-out").mkdir() + (tmp_path / "graphify-out" / "config.json").write_text("{not valid", encoding="utf-8") + monkeypatch.chdir(tmp_path) + captured = _capture_query_args(monkeypatch) + monkeypatch.setattr( + mainmod.sys, "argv", ["graphify", "query", "extract", "--graph", str(graph_path)] + ) + mainmod.main() # must not raise + assert captured["depth"] == 2 + assert captured["token_budget"] == 2000 + + +def test_query_cli_depth_rejects_non_integer(monkeypatch, tmp_path, capsys): + """`--depth notanint` errors cleanly (exit 1), like 
the affected command.""" + import pytest + + graph_path = _write_graph(tmp_path) + monkeypatch.chdir(tmp_path) + _capture_query_args(monkeypatch) + monkeypatch.setattr( + mainmod.sys, + "argv", + ["graphify", "query", "extract", "--depth", "notanint", "--graph", str(graph_path)], + ) + with pytest.raises(SystemExit) as exc: + mainmod.main() + assert exc.value.code == 1 + assert "--depth must be an integer" in capsys.readouterr().err + + +def test_query_cli_depth_trailing_no_value_is_graceful(monkeypatch, tmp_path, capsys): + """A trailing `--depth` with no value is ignored, not a crash; default holds.""" + graph_path = _write_graph(tmp_path) + monkeypatch.chdir(tmp_path) + captured = _capture_query_args(monkeypatch) + monkeypatch.setattr( + mainmod.sys, + "argv", + ["graphify", "query", "extract", "--graph", str(graph_path), "--depth"], + ) + mainmod.main() # must not raise + assert captured["depth"] == 2 + assert captured["token_budget"] == 2000 + + +def test_query_cli_depth_rejects_zero(monkeypatch, tmp_path, capsys): + """`--depth 0` is non-positive and must be rejected, not passed to traversal.""" + import pytest + + graph_path = _write_graph(tmp_path) + monkeypatch.chdir(tmp_path) + _capture_query_args(monkeypatch) + monkeypatch.setattr( + mainmod.sys, + "argv", + ["graphify", "query", "extract", "--depth", "0", "--graph", str(graph_path)], + ) + with pytest.raises(SystemExit) as exc: + mainmod.main() + assert exc.value.code == 1 + assert "--depth must be a positive integer" in capsys.readouterr().err + + +def test_query_cli_depth_rejects_negative(monkeypatch, tmp_path, capsys): + """`--depth -1` is non-positive and must be rejected.""" + import pytest + + graph_path = _write_graph(tmp_path) + monkeypatch.chdir(tmp_path) + _capture_query_args(monkeypatch) + monkeypatch.setattr( + mainmod.sys, + "argv", + ["graphify", "query", "extract", "--depth", "-1", "--graph", str(graph_path)], + ) + with pytest.raises(SystemExit) as exc: + mainmod.main() + assert 
exc.value.code == 1 + assert "--depth must be a positive integer" in capsys.readouterr().err + + def test_query_cli_rejects_oversized_graph(monkeypatch, tmp_path, capsys): """#F4: query CLI must refuse to parse a graph.json that exceeds the cap.""" import pytest