diff --git a/graphify/extract.py b/graphify/extract.py
index 4a99636b0..75266edf0 100644
--- a/graphify/extract.py
+++ b/graphify/extract.py
@@ -5304,7 +5304,142 @@ def extract_js(path: Path) -> dict:
         config = _TS_CONFIG
     else:
         config = _JS_CONFIG
-    return _extract_generic(path, config)
+    result = _extract_generic(path, config)
+    if "error" not in result:
+        _extract_js_rationale(path, result)
+    return result
+
+
+# ── JS/TS rationale + doc-reference extraction ────────────────────────────────
+#
+# Parity with _extract_python_rationale: Python files get rationale nodes from
+# docstrings and `# NOTE:`-style comments, but JS/TS comments were discarded
+# entirely. That silently drops two high-value signals in mixed corpora:
+#   1. rationale comments (`// NOTE:`, `// WHY:`, ...) — same as Python;
+#   2. architecture-decision references (`ADR-0011`, `RFC 793`) that teams
+#      conventionally cite in file/function headers. These are the natural
+#      join points between code and design docs in the same graph — without
+#      them, code<->ADR edges never form even when the code cites the ADR.
+
+# Markers that flag a comment as rationale.
+_JS_RATIONALE_MARKERS = (
+    "NOTE:", "IMPORTANT:", "HACK:", "WHY:", "RATIONALE:", "TODO:", "FIXME:",
+)
+
+# A marker counts when it directly follows a comment opener: a line comment,
+# a continuation line of a block comment, or the opener of a single-line
+# `/* ... */` / `/** ... */` comment.
+_JS_RATIONALE_PREFIXES = tuple(
+    f"{leader} {marker}"
+    for leader in ("//", "*", "/*", "/**")
+    for marker in _JS_RATIONALE_MARKERS
+)
+
+# Doc-reference tokens worth first-classing as graph nodes. Deliberately
+# conservative: ADR-NNNN (Architecture Decision Records, any zero padding)
+# and RFC NNNN / RFC-NNNN. Group 1 is the kind, group 2 the number.
+_JS_DOC_REF_RE = re.compile(r"\b(ADR|RFC)[- ]?(\d{1,5})\b", re.IGNORECASE)
+
+# Only look for doc references inside comments, not string literals or code.
+# The line must start with a comment opener, so a comment that trails code on
+# the same line is not scanned.
+_JS_COMMENT_LINE_RE = re.compile(r"^\s*(//|/\*|\*)")
+
+
+def _extract_js_rationale(path: Path, result: dict) -> None:
+    """Post-pass: extract rationale comments and doc references from JS/TS source.
+
+    A line whose stripped text starts with one of _JS_RATIONALE_PREFIXES adds a
+    "rationale" node plus a "rationale_for" edge to the file node. Each distinct
+    ADR/RFC token on a line that starts with a comment opener adds a "doc_ref"
+    node plus a "cites" edge from the file node.
+
+    Mutates result in-place by appending to result['nodes'] and result['edges'].
+    Leaves result untouched when the file cannot be read.
+
+    Args:
+        path: JS/TS source file to scan.
+        result: Extraction result; must hold "nodes" and "edges" lists.
+    """
+    try:
+        source_text = path.read_text(encoding="utf-8", errors="replace")
+    except OSError:
+        # Best-effort pass: an unreadable file simply contributes nothing.
+        # errors="replace" above means decoding itself cannot raise.
+        return
+
+    stem = _file_stem(path)
+    str_path = str(path)
+    nodes = result["nodes"]
+    edges = result["edges"]
+    seen_ids = {n["id"] for n in nodes}
+    file_nid = _make_id(str_path)
+    seen_doc_refs: set[str] = set()
+
+    def _add_rationale(text: str, line: int) -> None:
+        # splitlines() already removed line breaks, so trimming is enough.
+        label = text[:80].strip()
+        rid = _make_id(stem, "rationale", str(line))
+        if rid not in seen_ids:
+            seen_ids.add(rid)
+            nodes.append({
+                "id": rid,
+                "label": label,
+                "file_type": "rationale",
+                "source_file": str_path,
+                "source_location": f"L{line}",
+            })
+        edges.append({
+            "source": rid,
+            "target": file_nid,
+            "relation": "rationale_for",
+            "confidence": "EXTRACTED",
+            "source_file": str_path,
+            "source_location": f"L{line}",
+            "weight": 1.0,
+        })
+
+    def _add_doc_ref(kind: str, num: str, line: int) -> None:
+        # Canonicalise on the integer value so every spelling of one document
+        # ("adr 11", "ADR-0011", "ADR-00011") collapses to a single node.
+        kind = kind.upper()
+        number = int(num)
+        label = f"{kind}-{number:04d}" if kind == "ADR" else f"{kind}-{number}"
+        if label in seen_doc_refs:
+            return
+        seen_doc_refs.add(label)
+        rid = _make_id("docref", label)
+        if rid not in seen_ids:
+            seen_ids.add(rid)
+            nodes.append({
+                "id": rid,
+                "label": label,
+                "file_type": "doc_ref",
+                "source_file": str_path,
+                "source_location": f"L{line}",
+            })
+        edges.append({
+            "source": file_nid,
+            "target": rid,
+            "relation": "cites",
+            "confidence": "EXTRACTED",
+            "source_file": str_path,
+            "source_location": f"L{line}",
+            "weight": 1.0,
+        })
+
+    for lineno, line_text in enumerate(source_text.splitlines(), start=1):
+        stripped = line_text.strip()
+        if stripped.startswith(_JS_RATIONALE_PREFIXES):
+            text = stripped
+            # In a block comment a trailing "*/" is always the closer, never
+            # content; in a "//" comment it may be real text (e.g. a glob).
+            if not text.startswith("//") and text.endswith("*/"):
+                text = text[:-2]
+            _add_rationale(text.lstrip("/* "), lineno)
+        if _JS_COMMENT_LINE_RE.match(line_text):
+            for m in _JS_DOC_REF_RE.finditer(stripped):
+                _add_doc_ref(m.group(1), m.group(2), lineno)
 
 
 def extract_svelte(path: Path) -> dict:
diff --git a/tests/test_rationale.py b/tests/test_rationale.py
index b52aa3909..4c915664d 100644
--- a/tests/test_rationale.py
+++ b/tests/test_rationale.py
@@ -261,3 +261,96 @@ def normal(self) -> int:
             f"rationale node {r_id} for ``.{decorated_name}()`` is orphaned "
             f"(degree 0) after build_from_json"
         )
+
+
+# ── JS/TS rationale + doc-reference extraction ────────────────────────────────
+
+
+def _write_ts(tmp_path: Path, code: str) -> Path:
+    p = tmp_path / "sample.ts"
+    p.write_text(textwrap.dedent(code))
+    return p
+
+
+def test_js_rationale_comment_extracted(tmp_path):
+    from graphify.extract import extract_js
+    path = _write_ts(tmp_path, '''
+        // NOTE: must run before compile() or the linker will fail
+        export function build(): void {}
+    ''')
+    result = extract_js(path)
+    rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"]
+    assert any("NOTE" in n["label"] for n in rationale)
+
+
+def test_js_block_comment_rationale_extracted(tmp_path):
+    from graphify.extract import extract_js
+    path = _write_ts(tmp_path, '''
+        /**
+         * WHY: retries are capped because the upstream rate-limits at 10 rps.
+         */
+        export function fetchData(): void {}
+    ''')
+    result = extract_js(path)
+    rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"]
+    assert any("rate-limits" in n["label"] for n in rationale)
+
+
+def test_js_adr_reference_extracted(tmp_path):
+    from graphify.extract import extract_js
+    path = _write_ts(tmp_path, '''
+        // Gateway pattern per ADR-0002; provider selection per ADR-0015.
+        export function route(): void {}
+    ''')
+    result = extract_js(path)
+    refs = [n for n in result["nodes"] if n.get("file_type") == "doc_ref"]
+    labels = {n["label"] for n in refs}
+    assert "ADR-0002" in labels and "ADR-0015" in labels
+    cites = [e for e in result["edges"] if e.get("relation") == "cites"]
+    assert len(cites) == 2
+
+
+def test_js_adr_reference_normalized_and_deduped(tmp_path):
+    from graphify.extract import extract_js
+    path = _write_ts(tmp_path, '''
+        // See ADR-11 for the trust boundary.
+        // ADR 0011 also governs the injection containment below.
+        export function guard(): void {}
+    ''')
+    result = extract_js(path)
+    refs = [n for n in result["nodes"] if n.get("file_type") == "doc_ref"]
+    assert [n["label"] for n in refs] == ["ADR-0011"]
+
+
+def test_js_adr_in_string_literal_not_extracted(tmp_path):
+    from graphify.extract import extract_js
+    path = _write_ts(tmp_path, '''
+        export const banner = "compliant with ADR-0099";
+    ''')
+    result = extract_js(path)
+    refs = [n for n in result["nodes"] if n.get("file_type") == "doc_ref"]
+    assert refs == []
+
+
+def test_js_doc_ref_zero_padding_collapsed(tmp_path):
+    from graphify.extract import extract_js
+    path = _write_ts(tmp_path, '''
+        // ADR-00011 and ADR-11 name the same record.
+        // RFC 0793 and RFC-793 name the same document.
+        export function guard(): void {}
+    ''')
+    result = extract_js(path)
+    refs = [n for n in result["nodes"] if n.get("file_type") == "doc_ref"]
+    assert sorted(n["label"] for n in refs) == ["ADR-0011", "RFC-793"]
+
+
+def test_js_single_line_block_comment_rationale_extracted(tmp_path):
+    from graphify.extract import extract_js
+    path = _write_ts(tmp_path, '''
+        /** NOTE: cache is shared across requests */
+        export function lookup(): void {}
+    ''')
+    result = extract_js(path)
+    rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"]
+    labels = [n["label"] for n in rationale]
+    assert "NOTE: cache is shared across requests" in labels