From aee27b17cf3764eac097e6d142a6a4b08590a0e1 Mon Sep 17 00:00:00 2001 From: edgestack-ai <260736655+edgestack-ai@users.noreply.github.com> Date: Fri, 3 Jul 2026 07:40:02 +0200 Subject: [PATCH 1/3] feat: stamp graph.json with generated_at, warn on stale graphify query graph.json now records an ISO generated_at timestamp alongside the existing built_at_commit, written in export.to_json (the single chokepoint every build/update/watch path already funnels through). graphify query compares that stamp against the last commit time of the repo the graph indexes (resolved the same way build.py's _infer_merge_root already does, via the .graphify_root sidecar) and prints one warning line to stderr when the graph predates it. Missing or unreadable stamps (graphs built by an older graphify version) warn too, telling the caller to regenerate. Outside a git repo, or if git isn't available, the check is silently skipped rather than raising - query output is otherwise unchanged. --- graphify/__main__.py | 7 ++++++ graphify/export.py | 54 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/graphify/__main__.py b/graphify/__main__.py index 59dcd70a5..aac98fd4c 100644 --- a/graphify/__main__.py +++ b/graphify/__main__.py @@ -2897,6 +2897,13 @@ def main() -> None: _raw = _json.loads(gp.read_text(encoding="utf-8")) if "links" not in _raw and "edges" in _raw: _raw = dict(_raw, links=_raw["edges"]) + try: + from graphify.export import check_staleness + _stale_warning = check_staleness(_raw, gp) + if _stale_warning: + print(_stale_warning, file=sys.stderr) + except Exception: + pass # staleness check is diagnostic-only, never blocks a query try: G = json_graph.node_link_graph(_raw, edges="links") except TypeError: diff --git a/graphify/export.py b/graphify/export.py index 176b17909..8c1edf5fe 100644 --- a/graphify/export.py +++ b/graphify/export.py @@ -9,7 +9,7 @@ import shutil import sys from collections import Counter -from datetime 
import date +from datetime import date, datetime, timezone from pathlib import Path import networkx as nx from networkx.readwrite import json_graph @@ -482,6 +482,57 @@ def _git_head() -> str | None: return None +def check_staleness(raw: dict, graph_path: Path) -> str | None: + """Return a one-line stale-graph warning if graph.json is out of date, else None. + + "Out of date" means the on-disk graph.json predates the last git commit of + the repo it indexes. Missing/unreadable stamps (graphs built by an older + graphify version) also warn, telling the caller to regenerate. Never + raises — any failure resolving the repo root or running git is treated as + "can't tell", not an error, so this is safe to call outside a git repo. + """ + generated_at = raw.get("generated_at") + if not generated_at: + return ( + f"[graphify] warning: {graph_path} has no generation timestamp " + "(built by an older graphify version) - no stamp, regenerate with `graphify .`" + ) + try: + stamp = datetime.fromisoformat(generated_at) + except (TypeError, ValueError): + return ( + f"[graphify] warning: {graph_path} has an unreadable generation " + "timestamp - no stamp, regenerate with `graphify .`" + ) + + import subprocess as _sp + from graphify.build import _infer_merge_root + repo_root = _infer_merge_root(graph_path) + if repo_root is None: + return None # can't resolve the indexed root - nothing to compare against + try: + r = _sp.run( + ["git", "-C", str(repo_root), "log", "-1", "--format=%cI"], + capture_output=True, text=True, timeout=5, + ) + except Exception: + return None # git not available - nothing to compare against + if r.returncode != 0 or not r.stdout.strip(): + return None # not a git repo (or no commits yet) + try: + last_commit = datetime.fromisoformat(r.stdout.strip()) + except ValueError: + return None + + if last_commit > stamp: + return ( + f"[graphify] warning: graph.json was generated {generated_at} but " + f"the indexed repo's last commit is {r.stdout.strip()} - 
graph is " + "stale, run `graphify .` (or `graphify update`) to refresh" + ) + return None + + def to_json(G: nx.Graph, communities: dict[int, list[str]], output_path: str, *, force: bool = False, built_at_commit: str | None = None, community_labels: dict[int, str] | None = None) -> bool: # Safety check: refuse to silently shrink an existing graph (#479) existing_path = Path(output_path) @@ -537,6 +588,7 @@ def to_json(G: nx.Graph, communities: dict[int, list[str]], output_path: str, *, commit = built_at_commit if built_at_commit is not None else _git_head() if commit: data["built_at_commit"] = commit + data["generated_at"] = datetime.now(timezone.utc).isoformat() with open(output_path, "w", encoding="utf-8") as f: # nosec json.dump(data, f, indent=2) return True From d1692f49bde3cfeb78eac140b4c57b9c3864527a Mon Sep 17 00:00:00 2001 From: edgestack-ai <260736655+edgestack-ai@users.noreply.github.com> Date: Fri, 3 Jul 2026 08:04:39 +0200 Subject: [PATCH 2/3] fix: stamp every graph.json writer, record indexed repo root for staleness Root-cause fix for the review round on the generated_at/staleness feature: 1. graph.json stamping was only in export.to_json(), so --no-cluster extract (__main__.py) and --no-cluster update (watch.py) wrote unstamped graphs that check_staleness could never flag. Extracted the stamping logic into one chokepoint, stamp_graph_metadata(), that every writer now calls. 2. check_staleness inferred the indexed repo's root from the graph file's own location, so a graph written via --out could never be compared against the right repo. The root is now recorded IN the graph at write time (indexed_repo_root) and check_staleness prefers that, falling back to location-based inference only for legacy graphs that predate the field. 3. 
_canonical_graph_for_compare/_canonical_topology_for_compare (watch.py) now also exclude generated_at/indexed_repo_root from same-graph/topology comparisons, alongside the existing built_at_commit exclusion - required so the --no-cluster update path's "no changes, left untouched" detection keeps working now that every write is timestamped. Added coverage for all 3 reviewer repro cases: --no-cluster extract stamps (test_extract_cli.py), --no-cluster update stamps (test_watch.py), and --out-elsewhere staleness detection via the recorded root (test_export.py). --- graphify/__main__.py | 6 +- graphify/export.py | 38 +++++++++--- graphify/watch.py | 11 +++- tests/test_export.py | 123 ++++++++++++++++++++++++++++++++++++++ tests/test_extract_cli.py | 33 ++++++++++ tests/test_watch.py | 32 ++++++++++ 6 files changed, 233 insertions(+), 10 deletions(-) diff --git a/graphify/__main__.py b/graphify/__main__.py index aac98fd4c..771f5055c 100644 --- a/graphify/__main__.py +++ b/graphify/__main__.py @@ -3673,7 +3673,7 @@ def main() -> None: json.dumps(analysis, indent=2, ensure_ascii=False), encoding="utf-8", ) - to_json(G, communities, str(out / "graph.json"), community_labels=labels) + to_json(G, communities, str(out / "graph.json"), community_labels=labels, indexed_repo_root=watch_path) labels_path.write_text(json.dumps({str(k): v for k, v in labels.items()}, ensure_ascii=False), encoding="utf-8") # Membership signatures beside the labels so a later cluster-only can detect # which communities changed and avoid reusing a stale label (see reuse above). 
@@ -4914,6 +4914,8 @@ def _progress(idx: int, total: int, _result: dict) -> None: _node_sf.get(_e.get("source")) or _node_sf.get(_e.get("target")) or "" ) _backup(graphify_out) + from graphify.export import stamp_graph_metadata as _stamp_graph_metadata + _stamp_graph_metadata(merged, indexed_repo_root=target) graph_json_path.write_text( json.dumps(merged, indent=2), encoding="utf-8" ) @@ -4998,7 +5000,7 @@ def _progress(idx: int, total: int, _result: dict) -> None: from graphify.export import backup_if_protected as _backup _backup(graphify_out) - _to_json(G, communities, str(graph_json_path), force=True) + _to_json(G, communities, str(graph_json_path), force=True, indexed_repo_root=target) stages.mark("export") if merged.get("output_tokens", 0) > 0: (graphify_out / ".graphify_semantic_marker").write_text( diff --git a/graphify/export.py b/graphify/export.py index 8c1edf5fe..f44f401e9 100644 --- a/graphify/export.py +++ b/graphify/export.py @@ -506,8 +506,14 @@ def check_staleness(raw: dict, graph_path: Path) -> str | None: ) import subprocess as _sp - from graphify.build import _infer_merge_root - repo_root = _infer_merge_root(graph_path) + # Prefer the indexed repo root recorded IN the graph at write time (stamp_graph_metadata) — + # correct even when the graph was written to --out DIR. Only legacy graphs + # (written before this field existed) fall back to inferring the root from the + # graph file's own location, which is wrong for an out-of-tree --out path.
+ repo_root = raw.get("indexed_repo_root") + if not repo_root: + from graphify.build import _infer_merge_root + repo_root = _infer_merge_root(graph_path) if repo_root is None: return None # can't resolve the indexed root - nothing to compare against try: @@ -533,7 +539,28 @@ def check_staleness(raw: dict, graph_path: Path) -> str | None: return None -def to_json(G: nx.Graph, communities: dict[int, list[str]], output_path: str, *, force: bool = False, built_at_commit: str | None = None, community_labels: dict[int, str] | None = None) -> bool: +def stamp_graph_metadata(data: dict, *, indexed_repo_root: "str | os.PathLike | None" = None, built_at_commit: str | None = None) -> None: + """Stamp graph.json metadata that ``check_staleness`` reads: generation + timestamp, build commit, and the indexed repo's root. + + This is the ONE chokepoint every graph.json writer must call — clustered + (``to_json``) and raw ``--no-cluster`` writers alike — so staleness detection + works regardless of which code path produced the file or where ``--out`` put + it (#1618-followup: a prior fix stamped only ``to_json``, leaving --no-cluster + writes and --out-elsewhere graphs unstamped/unresolvable). 
+ """ + commit = built_at_commit if built_at_commit is not None else _git_head() + if commit: + data["built_at_commit"] = commit + data["generated_at"] = datetime.now(timezone.utc).isoformat() + if indexed_repo_root is not None: + try: + data["indexed_repo_root"] = str(Path(indexed_repo_root).resolve()) + except OSError: + data["indexed_repo_root"] = str(indexed_repo_root) + + +def to_json(G: nx.Graph, communities: dict[int, list[str]], output_path: str, *, force: bool = False, built_at_commit: str | None = None, community_labels: dict[int, str] | None = None, indexed_repo_root: "str | os.PathLike | None" = None) -> bool: # Safety check: refuse to silently shrink an existing graph (#479) existing_path = Path(output_path) if not force and existing_path.exists(): @@ -585,10 +612,7 @@ def to_json(G: nx.Graph, communities: dict[int, list[str]], output_path: str, *, link["source"] = true_src link["target"] = true_tgt data["hyperedges"] = getattr(G, "graph", {}).get("hyperedges", []) - commit = built_at_commit if built_at_commit is not None else _git_head() - if commit: - data["built_at_commit"] = commit - data["generated_at"] = datetime.now(timezone.utc).isoformat() + stamp_graph_metadata(data, indexed_repo_root=indexed_repo_root, built_at_commit=built_at_commit) with open(output_path, "w", encoding="utf-8") as f: # nosec json.dump(data, f, indent=2) return True diff --git a/graphify/watch.py b/graphify/watch.py index c3fa5ed84..8fffcab46 100644 --- a/graphify/watch.py +++ b/graphify/watch.py @@ -272,7 +272,12 @@ def _node_community_map(graph_data: dict) -> dict[str, int]: def _canonical_graph_for_compare(graph_data: dict) -> dict: canonical = dict(graph_data) + # Build-time metadata stamped by stamp_graph_metadata() varies every run + # (timestamp always changes; commit/root can too) without reflecting an + # actual graph-content change, so it must never affect a same-graph verdict. 
canonical.pop("built_at_commit", None) + canonical.pop("generated_at", None) + canonical.pop("indexed_repo_root", None) for key in ("nodes", "links", "edges", "hyperedges"): if key in canonical and isinstance(canonical[key], list): canonical[key] = sorted( @@ -285,6 +290,8 @@ def _canonical_graph_for_compare(graph_data: dict) -> dict: def _canonical_topology_for_compare(graph_data: dict) -> dict: canonical = dict(graph_data) canonical.pop("built_at_commit", None) + canonical.pop("generated_at", None) + canonical.pop("indexed_repo_root", None) nodes = canonical.get("nodes") if isinstance(nodes, list): @@ -713,11 +720,13 @@ def _edge_evicted(e: dict) -> bool: # without it, --no-cluster + repeated `update` accumulate duplicates and edge # counts diverge across build modes (#1317). from graphify.build import dedupe_edges as _dedupe_edges, dedupe_nodes as _dedupe_nodes + from graphify.export import stamp_graph_metadata as _stamp_graph_metadata candidate_graph_data = { **{k: v for k, v in result.items() if k not in ("edges", "nodes")}, "nodes": _dedupe_nodes(result.get("nodes", [])), "links": _dedupe_edges(result.get("edges", [])), } + _stamp_graph_metadata(candidate_graph_data, indexed_repo_root=project_root, built_at_commit=commit) candidate_graph_text = _json_text(candidate_graph_data) same_graph = False if existing_graph.exists(): @@ -817,7 +826,7 @@ def _edge_evicted(e: dict) -> bool: report_path = out / "GRAPH_REPORT.md" labels_json = json.dumps({str(k): v for k, v in sorted(labels.items())}, ensure_ascii=False, indent=2) + "\n" graph_tmp = out / ".graph.tmp.json" - json_written = to_json(G, communities, str(graph_tmp), force=True, built_at_commit=commit) + json_written = to_json(G, communities, str(graph_tmp), force=True, built_at_commit=commit, indexed_repo_root=project_root) if not json_written: return False candidate_graph_data = json.loads(graph_tmp.read_text(encoding="utf-8")) diff --git a/tests/test_export.py b/tests/test_export.py index be4743bc5..4c31c9051 
100644 --- a/tests/test_export.py +++ b/tests/test_export.py @@ -1,5 +1,6 @@ import json import math +import os import re import tempfile from pathlib import Path @@ -603,3 +604,125 @@ def test_backup_env_disable(tmp_path, monkeypatch): (tmp_path / "graph.json").write_text('{"nodes":[],"links":[]}') (tmp_path / ".graphify_semantic_marker").write_text("{}") assert backup_if_protected(tmp_path) is None + + +# --- staleness chokepoint: every graph.json writer must stamp via +# stamp_graph_metadata, and check_staleness must use the recorded indexed +# repo root rather than inferring it from the graph file's own location --- + + +def test_to_json_stamps_generated_at_and_indexed_repo_root(tmp_path): + """to_json (the clustered writer) records both generated_at and, when the + caller passes indexed_repo_root, the resolved indexed repo root.""" + G = make_graph() + communities = cluster(G) + out = tmp_path / "graph.json" + repo_root = tmp_path / "somewhere-else" + repo_root.mkdir() + to_json(G, communities, str(out), indexed_repo_root=repo_root) + data = json.loads(out.read_text()) + assert "generated_at" in data + assert data["indexed_repo_root"] == str(repo_root.resolve()) + + +def test_to_json_omits_indexed_repo_root_when_not_given(tmp_path): + """Without an explicit indexed_repo_root, to_json still stamps generated_at + but doesn't fabricate a root (legacy-graph shape stays legacy).""" + G = make_graph() + communities = cluster(G) + out = tmp_path / "graph.json" + to_json(G, communities, str(out)) + data = json.loads(out.read_text()) + assert "generated_at" in data + assert "indexed_repo_root" not in data + + +def _init_repo_with_one_commit(repo_dir: Path) -> None: + import subprocess + repo_dir.mkdir(parents=True, exist_ok=True) + subprocess.run(["git", "init", "-q", str(repo_dir)], check=True) + subprocess.run(["git", "-C", str(repo_dir), "config", "user.email", "t@example.com"], check=True) + subprocess.run(["git", "-C", str(repo_dir), "config", "user.name",
"Test"], check=True) + (repo_dir / "a.py").write_text("def a():\n return 1\n", encoding="utf-8") + subprocess.run(["git", "-C", str(repo_dir), "add", "a.py"], check=True) + subprocess.run(["git", "-C", str(repo_dir), "commit", "-q", "-m", "init"], check=True) + + +def _last_commit_iso(repo_dir: Path) -> str: + import subprocess + r = subprocess.run( + ["git", "-C", str(repo_dir), "log", "-1", "--format=%cI"], + capture_output=True, text=True, check=True, + ) + return r.stdout.strip() + + +def _advance_repo(repo_dir: Path) -> None: + """Commit again with an explicit committer date safely past the repo's + current HEAD, so ordering doesn't depend on git's 1-second timestamp + resolution racing the test's wall-clock stamp.""" + import subprocess + from datetime import datetime, timedelta, timezone + current = datetime.fromisoformat(_last_commit_iso(repo_dir)) + later = (current + timedelta(seconds=5)).isoformat() + (repo_dir / "a.py").write_text("def a():\n return 2\n", encoding="utf-8") + subprocess.run(["git", "-C", str(repo_dir), "add", "a.py"], check=True) + env = {**os.environ, "GIT_AUTHOR_DATE": later, "GIT_COMMITTER_DATE": later} + subprocess.run(["git", "-C", str(repo_dir), "commit", "-q", "-m", "advance"], check=True, env=env) + + +def test_check_staleness_uses_recorded_root_for_out_elsewhere_graph(tmp_path): + """#2 repro: a graph.json written via `--out DIR` (unrelated to the + indexed repo by directory structure) must still warn once the indexed repo + advances, because the root was recorded IN the graph at write time rather + than inferred from the graph file's own location.""" + from graphify.export import check_staleness + + repo_dir = tmp_path / "repo" + _init_repo_with_one_commit(repo_dir) + + # graph.json lives in a completely unrelated directory - inferring the + # root from graph_path's location (parent.parent) would resolve to + # tmp_path/elsewhere, which is not a git repo at all.
+ elsewhere = tmp_path / "elsewhere" / "nested" + elsewhere.mkdir(parents=True) + graph_path = elsewhere / "graph.json" + + from datetime import datetime, timezone + stamp = datetime.now(timezone.utc).isoformat() + raw = {"generated_at": stamp, "indexed_repo_root": str(repo_dir.resolve())} + graph_path.write_text(json.dumps(raw)) + + # Not stale yet - graph was just generated, repo hasn't moved since. + assert check_staleness(raw, graph_path) is None + + _advance_repo(repo_dir) + warning = check_staleness(raw, graph_path) + assert warning is not None + assert "stale" in warning + + +def test_check_staleness_legacy_graph_without_recorded_root_cannot_detect_out_elsewhere(tmp_path): + """Documents the pre-fix limitation for LEGACY graphs (no indexed_repo_root): + location-based inference is the only option left, and for an --out-elsewhere + graph it resolves to a non-repo directory, so staleness silently can't be + determined (no false positive, no false negative - just "can't tell").""" + from graphify.export import check_staleness + + repo_dir = tmp_path / "repo" + _init_repo_with_one_commit(repo_dir) + + elsewhere = tmp_path / "elsewhere" / "nested" + elsewhere.mkdir(parents=True) + graph_path = elsewhere / "graph.json" + + from datetime import datetime, timezone + stamp = datetime.now(timezone.utc).isoformat() + raw = {"generated_at": stamp} # no indexed_repo_root - legacy shape + graph_path.write_text(json.dumps(raw)) + + _advance_repo(repo_dir) + # location-based fallback resolves to `elsewhere` (parent.parent of + # graph_path), which isn't a git repo, so git fails and the check + # abstains rather than warning. 
+ assert check_staleness(raw, graph_path) is None diff --git a/tests/test_extract_cli.py b/tests/test_extract_cli.py index c301c50e5..9d75ad6de 100644 --- a/tests/test_extract_cli.py +++ b/tests/test_extract_cli.py @@ -1,6 +1,8 @@ """Tests for `graphify extract` CLI dispatch path in graphify.__main__.""" from __future__ import annotations +import json + import pytest import graphify.__main__ as mainmod @@ -265,3 +267,34 @@ def test_extract_timing_flag_emits_stage_timings(monkeypatch, tmp_path, capsys): mainmod.main() assert exc2.value.code == 0 assert "graphify timing" not in capsys.readouterr().err + + +def test_extract_no_cluster_stamps_generated_at_and_indexed_repo_root(monkeypatch, tmp_path): + """#1 repro: the --no-cluster raw writer (extract path, __main__.py) must stamp + graph.json exactly like the clustered to_json() writer does - generated_at plus + the indexed repo root - so check_staleness can detect a stale graph regardless + of which extract mode produced it or where --out placed it.""" + code = tmp_path / "code" + code.mkdir() + (code / "a.py").write_text("def a():\n return 1\n") + out_dir = tmp_path / "external-out" + + monkeypatch.setattr(mainmod, "_check_skill_version", lambda _: None) + monkeypatch.setattr( + mainmod.sys, "argv", + ["graphify", "extract", str(code), "--no-cluster", "--out", str(out_dir)], + ) + with pytest.raises(SystemExit) as exc: + mainmod.main() + assert exc.value.code == 0 + + graph_path = out_dir / "graphify-out" / "graph.json" + assert graph_path.exists() + data = json.loads(graph_path.read_text(encoding="utf-8")) + assert "generated_at" in data, ( + "--no-cluster extract must stamp generated_at just like the clustered writer" + ) + assert data.get("indexed_repo_root") == str(code.resolve()), ( + '--no-cluster extract must record the indexed repo root (the scanned "code" ' + "dir), not wherever --out happens to point" + ) diff --git a/tests/test_watch.py b/tests/test_watch.py index f1b845125..1ffb41cd2 100644 --- 
a/tests/test_watch.py +++ b/tests/test_watch.py @@ -721,6 +721,38 @@ def test_rebuild_code_accepts_repo_relative_changed_path_for_subdir_root(tmp_pat os.chdir(cwd) +def test_rebuild_code_no_cluster_stamps_generated_at_and_indexed_repo_root(tmp_path): + """#1 repro (second writer site): the --no-cluster raw writer in the UPDATE + path (watch.py's _rebuild_code) must stamp graph.json exactly like to_json() + does, on both the first write and a subsequent re-write, so check_staleness + can detect a stale graph produced by `graphify update --no-cluster`.""" + from graphify.watch import _rebuild_code + + project = tmp_path / "proj" + project.mkdir() + (project / "a.py").write_text("def a():\n return 1\n", encoding="utf-8") + + assert _rebuild_code(project, no_cluster=True, acquire_lock=False) is True + graph_path = project / "graphify-out" / "graph.json" + before = json.loads(graph_path.read_text(encoding="utf-8")) + assert "generated_at" in before, ( + "--no-cluster update must stamp generated_at just like the clustered writer" + ) + assert before.get("indexed_repo_root") == str(project.resolve()), ( + "--no-cluster update must record the indexed repo root" + ) + + # A second update (new content, forced) must re-stamp too - not just the + # first write of a fresh graph. 
+ (project / "a.py").write_text("def a():\n return 2\n", encoding="utf-8") + assert _rebuild_code( + project, no_cluster=True, acquire_lock=False, force=True, + ) is True + after = json.loads(graph_path.read_text(encoding="utf-8")) + assert "generated_at" in after + assert after.get("indexed_repo_root") == str(project.resolve()) + + # --- #1059: pending-changes queue prevents commit drops under lock contention --- From 2fd14d8a6908b4a1590de76042a6bd9cac225b9c Mon Sep 17 00:00:00 2001 From: edgestack-ai <260736655+edgestack-ai@users.noreply.github.com> Date: Fri, 3 Jul 2026 08:14:40 +0200 Subject: [PATCH 3/3] fix: stamp merge-driver's graph.json write (last unstamped writer) The merge-driver command wrote graph.json directly (__main__.py) bypassing stamp_graph_metadata() (d1692f4's chokepoint), so a merge-committed graph.json carried no generated_at/indexed_repo_root and check_staleness could never flag it as stale. merge-driver has no natural indexed-root argument - git invokes it as `graphify merge-driver %O %A %B` with three throwaway temp file paths, not the real graphify-out/graph.json location. Resolve the actual repo root via `git rev-parse --show-toplevel` (git runs merge drivers with cwd at the top of the work tree), falling back to the current side's previously recorded indexed_repo_root, then to cwd. Added tests/test_merge_driver_cli.py covering both the normal path and the outside-a-git-repo fallback. 
--- graphify/__main__.py | 25 +++++++- tests/test_merge_driver_cli.py | 113 +++++++++++++++++++++++++++++++++ 2 files changed, 137 insertions(+), 1 deletion(-) create mode 100644 tests/test_merge_driver_cli.py diff --git a/graphify/__main__.py b/graphify/__main__.py index 771f5055c..924d543e4 100644 --- a/graphify/__main__.py +++ b/graphify/__main__.py @@ -3867,7 +3867,7 @@ def _load_graph(p: str): except TypeError: return _jg.node_link_graph(data), data try: - G_cur, _ = _load_graph(_current_path) + G_cur, _data_cur = _load_graph(_current_path) G_oth, _ = _load_graph(_other_path) except Exception as exc: print(f"[graphify merge-driver] error loading graphs: {exc}", file=sys.stderr) @@ -3884,6 +3884,29 @@ def _load_graph(p: str): out_data = _jg.node_link_data(merged, edges="links") except TypeError: out_data = _jg.node_link_data(merged) + # Stamp before writing (d1692f4's chokepoint) so a merged graph.json + # still carries a fresh generated_at/built_at_commit for check_staleness. + # There's no single "indexed root" argument here — git invokes merge + # drivers with (base, current, other) as throwaway temp file paths, not + # the real graphify-out/graph.json location, so _current_path's parent + # dir can't be used the way _infer_merge_root normally would. Instead, + # resolve the actual repo root git is merging in: git runs merge + # drivers with cwd set to the top of the work tree, so `rev-parse + # --show-toplevel` from here IS the indexed repo root. Fall back to + # whatever root the current side was already stamped with, then to cwd. 
+ import subprocess as _sp + try: + _mr = _sp.run( + ["git", "rev-parse", "--show-toplevel"], + capture_output=True, text=True, timeout=3, + ) + _merge_repo_root = _mr.stdout.strip() if _mr.returncode == 0 and _mr.stdout.strip() else None + except Exception: + _merge_repo_root = None + if not _merge_repo_root: + _merge_repo_root = _data_cur.get("indexed_repo_root") or str(Path.cwd()) + from graphify.export import stamp_graph_metadata as _stamp_graph_metadata + _stamp_graph_metadata(out_data, indexed_repo_root=_merge_repo_root) Path(_current_path).write_text(json.dumps(out_data, indent=2), encoding="utf-8") sys.exit(0) diff --git a/tests/test_merge_driver_cli.py b/tests/test_merge_driver_cli.py new file mode 100644 index 000000000..46e0e5677 --- /dev/null +++ b/tests/test_merge_driver_cli.py @@ -0,0 +1,113 @@ +"""`graphify merge-driver` must stamp its output like every other graph.json +writer (d1692f4's stamp_graph_metadata() chokepoint), so a merge-committed +graph.json still carries generated_at/indexed_repo_root for check_staleness. + +Unlike the other writers, merge-driver has no natural "root" argument: git +invokes it as `graphify merge-driver %O %A %B` with three throwaway temp file +paths, not the real graphify-out/graph.json location. The fix resolves the +indexed repo root via `git rev-parse --show-toplevel`, relying on git running +merge drivers with cwd set to the top of the work tree - so these tests set +`cwd` to the repo being merged, exactly like a real git merge invocation. 
+""" +from __future__ import annotations + +import json +import subprocess +import sys +from pathlib import Path + +PYTHON = sys.executable + + +def _run(args, cwd): + return subprocess.run([PYTHON, "-m", "graphify"] + args, cwd=cwd, + capture_output=True, text=True) + + +def _init_repo_with_one_commit(repo_dir: Path) -> None: + repo_dir.mkdir(parents=True, exist_ok=True) + subprocess.run(["git", "init", "-q", str(repo_dir)], check=True) + subprocess.run(["git", "-C", str(repo_dir), "config", "user.email", "t@example.com"], check=True) + subprocess.run(["git", "-C", str(repo_dir), "config", "user.name", "Test"], check=True) + (repo_dir / "a.py").write_text("def a():\n return 1\n", encoding="utf-8") + subprocess.run(["git", "-C", str(repo_dir), "add", "a.py"], check=True) + subprocess.run(["git", "-C", str(repo_dir), "commit", "-q", "-m", "init"], check=True) + + +def _write_graph(p: Path, node_id: str) -> None: + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(json.dumps({ + "directed": False, "multigraph": False, "graph": {}, + "nodes": [{"id": node_id}], "links": [], + })) + + +def test_merge_driver_stamps_generated_at_and_indexed_repo_root(tmp_path): + """Reviewer repro: the merge-driver writer (__main__.py) was the last + graph.json writer left unstamped after d1692f4. Its output must gain + generated_at and an indexed_repo_root resolved to the repo git is merging + in - not left absent, and not guessed from the (unrelated) temp file + paths git passes for base/current/other.""" + repo_dir = tmp_path / "repo" + _init_repo_with_one_commit(repo_dir) + + # base/current/other live OUTSIDE the repo, like git's real temp files do. 
+ tmp_side = tmp_path / "merge-tmp" + base = tmp_side / "base.json" + current = tmp_side / "current.json" + other = tmp_side / "other.json" + _write_graph(base, "x") + _write_graph(current, "x") + _write_graph(other, "y") + + r = _run( + ["merge-driver", str(base), str(current), str(other)], + cwd=repo_dir, + ) + assert r.returncode == 0, f"merge-driver failed: {r.stderr}" + + data = json.loads(current.read_text(encoding="utf-8")) + ids = {n["id"] for n in data["nodes"]} + assert ids == {"x", "y"} + assert "generated_at" in data, ( + "merge-driver must stamp generated_at just like every other writer" + ) + assert data.get("indexed_repo_root") == str(repo_dir.resolve()), ( + "merge-driver must record the repo git is merging in (its cwd), not " + "the throwaway base/current/other temp file locations" + ) + + +def test_merge_driver_falls_back_to_current_side_root_outside_git(tmp_path): + """When merge-driver somehow runs outside a git work tree (rev-parse + --show-toplevel fails), it must not fabricate a root out of thin air - + it falls back to whatever indexed_repo_root the current side already + carried, so a legitimately-rooted graph doesn't get silently rebased + onto some throwaway cwd.""" + outside_dir = tmp_path / "not-a-repo" + outside_dir.mkdir() + + tmp_side = tmp_path / "merge-tmp" + prior_root = tmp_path / "prior-root" + prior_root.mkdir() + base = tmp_side / "base.json" + current = tmp_side / "current.json" + other = tmp_side / "other.json" + _write_graph(base, "x") + current.parent.mkdir(parents=True, exist_ok=True) + current.write_text(json.dumps({ + "directed": False, "multigraph": False, "graph": {}, + "nodes": [{"id": "x"}], "links": [], + "indexed_repo_root": str(prior_root.resolve()), + })) + _write_graph(other, "y") + + r = _run( + ["merge-driver", str(base), str(current), str(other)], + cwd=outside_dir, + ) + assert r.returncode == 0, f"merge-driver failed: {r.stderr}" + + data = json.loads(current.read_text(encoding="utf-8")) + assert 
"generated_at" in data + assert data.get("indexed_repo_root") == str(prior_root.resolve())