From 974b922b8b12da840736cda005bb8fc4a61e0512 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Tue, 23 Jun 2026 02:35:05 +0800 Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Add temporal and trajectory adapter evidence coverage","authority":"XY-1070"} --- README.md | 18 +++- .../graphiti_temporal_validity_blocked.json | 52 +++++++++- ...penviking_hierarchy_selection_blocked.json | 21 ++++ ...penviking_recursive_expansion_blocked.json | 23 +++++ .../openviking_staged_retrieval_blocked.json | 21 ++++ .../tests/real_world_job_benchmark.rs | 58 ++++++++++- ...oral-trajectory-adapter-coverage-report.md | 97 +++++++++++++++++++ docs/evidence/benchmarking/index.md | 1 + .../real_world_agent_memory_benchmark.md | 45 ++++++++- .../real_world_agent_memory_benchmark_v1.md | 38 +++++++- scripts/graphiti-zep-docker-temporal-smoke.py | 16 +++ 11 files changed, 379 insertions(+), 11 deletions(-) create mode 100644 docs/evidence/benchmarking/2026-06-23-temporal-trajectory-adapter-coverage-report.md diff --git a/README.md b/README.md index c497e2d9..39fd23b6 100644 --- a/README.md +++ b/README.md @@ -289,6 +289,16 @@ provider-backed ELF evidence was required. exported core block JSON, archival passage/readback/search JSON, and source ids are present. The report makes no hosted mem0 Platform, OpenMemory UI/export, or Letta parity, win, tie, or loss claim. +- Temporal/trajectory adapter coverage after XY-1070: the June 23 follow-up refreshes + Graphiti/Zep temporal-validity and OpenViking context-trajectory evidence. The + Graphiti/Zep blocked fixture now includes current, historical, and provider-boundary + source ids plus trace-stage readback, and the generated smoke manifest emits a + temporal-validity scenario row. The OpenViking staged, hierarchy, and recursive + fixtures remain 3 typed blockers with 3 trace-stage artifacts for same-corpus, + missing stage/hierarchy/recursive output, rejected sibling or decoy handling, and + comparison gates. 
This improves auditability only: no graph-memory parity, + OpenViking trajectory win/tie/loss, hosted Zep, private-corpus, or provider-backed + quality claim is made. - Operator-approved public-proxy addendum after XY-930: the June 19 follow-up runs `cargo make baseline-production-private-addendum` with a simulated/public-proxy production corpus manifest approved for this stage. The run records 12 documents, @@ -424,6 +434,7 @@ Detailed evidence and interpretation: - [P2 Knowledge Workspace PageIndex/OpenKB Closeout Report - June 22, 2026](docs/evidence/benchmarking/2026-06-22-p2-knowledge-workspace-pageindex-openkb-closeout-report.md) - [PageIndex/OpenKB Same-Corpus Adapter Report - June 22, 2026](docs/evidence/benchmarking/2026-06-22-pageindex-openkb-same-corpus-adapter-report.md) - [mem0/OpenMemory and Letta Memory-History/Core-Archive Adapter Report - June 22, 2026](docs/evidence/benchmarking/2026-06-22-mem0-openmemory-letta-memory-history-core-archive-report.md) +- [Temporal and Trajectory Adapter Coverage Report - June 23, 2026](docs/evidence/benchmarking/2026-06-23-temporal-trajectory-adapter-coverage-report.md) - [Live Baseline Benchmark Runbook](docs/runbook/benchmarking/live_baseline_benchmark.md) - [Single-User Production Runbook](docs/runbook/single_user_production.md) - Benchmark contract: @@ -517,6 +528,7 @@ Detailed comparison, mechanism-level analysis, and source map: - [P2 Knowledge Workspace PageIndex/OpenKB Closeout Report - June 22, 2026](docs/evidence/benchmarking/2026-06-22-p2-knowledge-workspace-pageindex-openkb-closeout-report.md) - [PageIndex/OpenKB Same-Corpus Adapter Report - June 22, 2026](docs/evidence/benchmarking/2026-06-22-pageindex-openkb-same-corpus-adapter-report.md) - [mem0/OpenMemory and Letta Memory-History/Core-Archive Adapter Report - June 22, 2026](docs/evidence/benchmarking/2026-06-22-mem0-openmemory-letta-memory-history-core-archive-report.md) +- [Temporal and Trajectory Adapter Coverage Report - June 23, 
2026](docs/evidence/benchmarking/2026-06-23-temporal-trajectory-adapter-coverage-report.md) - [Live Baseline Benchmark Runbook](docs/runbook/benchmarking/live_baseline_benchmark.md) - [Real-World Agent Memory Benchmark](docs/runbook/benchmarking/real_world_agent_memory_benchmark.md) - [External Memory Improvement Plan](docs/evidence/external_memory/external_memory_improvement_plan.md) @@ -528,14 +540,14 @@ Detailed comparison, mechanism-level analysis, and source map: - [Derived Knowledge Page Follow-Up Research](docs/research/derived_knowledge_page_followup.md) - [Dreaming Product Surface Follow-Up Research](docs/research/dreaming_product_surface_followup.md) -Latest real-world benchmark report: June 22, 2026. Latest external research refresh: +Latest real-world benchmark report: June 23, 2026. Latest external research refresh: June 11, 2026; June 20 adds the Agent Knowledge OS Closeout Benchmark Report, the Graph Topic-Map Report - June 20, 2026, Knowledge Workspace Version-Diff Report - June 20, 2026, and the Live Knowledge-Page Rebuild/Lint Report - June 20, 2026; June 22 adds the P1 Memory Authority Closeout Report, P2 Knowledge Workspace PageIndex/OpenKB Closeout Report, PageIndex/OpenKB Same-Corpus Adapter -Report, and mem0/OpenMemory and Letta Memory-History/Core-Archive Adapter Report -after the June 19 +Report, and mem0/OpenMemory and Letta Memory-History/Core-Archive Adapter Report; +June 23 adds the Temporal and Trajectory Adapter Coverage Report after the June 19 XY-930 operator-approved public-proxy production addendum and service-native Dreaming readback, the qmd debug-ergonomics Dreaming retest, the June 17 competitor-strength closeout, and the June 16 temporal reconciliation, live consolidation self-check, diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphiti_temporal_validity_blocked.json b/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphiti_temporal_validity_blocked.json index 1c649e71..77ae3a47 
100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphiti_temporal_validity_blocked.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphiti_temporal_validity_blocked.json @@ -66,7 +66,57 @@ }, "created_at": "2026-06-11T17:17:00Z" } - ] + ], + "adapter_response": { + "adapter_id": "fixture_graphiti_zep_temporal_validity", + "answer": { + "content": "Graphiti/Zep temporal scoring requires current and historical facts with validity windows. The representative adapter output remains blocked until provider-backed temporal search maps those facts to generated evidence ids.", + "claims": [ + { + "claim_id": "graphiti_temporal_contract", + "text": "Graphiti/Zep temporal scoring requires current and historical facts with validity windows.", + "evidence_ids": [ + "graphiti-current-fact-contract", + "graphiti-historical-fact-contract", + "graphiti-provider-boundary" + ], + "confidence": "high" + } + ], + "evidence_ids": [ + "graphiti-current-fact-contract", + "graphiti-historical-fact-contract", + "graphiti-provider-boundary" + ], + "trace_explainability": { + "trace_id": "fixture-graphiti-zep-temporal-validity-blocked", + "failure_stage": "graphiti.provider_boundary", + "failure_reason": "provider_api_key_missing blocks live temporal search output, so the fixture records the current/historical validity-window contract instead of scoring parity.", + "stages": [ + { + "stage_name": "graphiti.validity_window_contract", + "kept_evidence": [ + "graphiti-current-fact-contract", + "graphiti-historical-fact-contract" + ], + "notes": "The typed blocker still names the current and historical source ids required before scoring." + }, + { + "stage_name": "graphiti.provider_boundary", + "kept_evidence": ["graphiti-provider-boundary"], + "notes": "Missing explicit provider configuration is a valid typed blocker, not a failed ELF graph-memory comparison." 
+ } + ] + }, + "latency_ms": 0.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } }, "timeline": [ { diff --git a/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_hierarchy_selection_blocked.json b/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_hierarchy_selection_blocked.json index 96e48c4e..a18620a1 100644 --- a/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_hierarchy_selection_blocked.json +++ b/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_hierarchy_selection_blocked.json @@ -113,6 +113,27 @@ "amount": 0.0, "input_tokens": 0, "output_tokens": 0 + }, + "trace_explainability": { + "trace_id": "fixture-openviking-hierarchy-selection-blocked", + "failure_stage": "openviking.hierarchy_artifact_gate", + "failure_reason": "Selected parent, child, resource, and rejected sibling evidence is not materialized, so hierarchy selection remains a typed blocker.", + "stages": [ + { + "stage_name": "openviking.same_corpus_gate", + "kept_evidence": ["same-corpus-before-hierarchy"], + "notes": "Hierarchy scoring is gated behind same-corpus expected evidence id coverage." + }, + { + "stage_name": "openviking.hierarchy_artifact_gate", + "kept_evidence": [ + "hierarchy-selection-output-contract", + "hierarchy-comparison-requires-elf-equivalent" + ], + "dropped_evidence": ["hierarchy-design-win-decoy"], + "notes": "The required artifact must show selected hierarchy nodes plus the rejected sibling or decoy context before any ELF/OpenViking comparison is scored." 
+ } + ] } } } diff --git a/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_recursive_expansion_blocked.json b/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_recursive_expansion_blocked.json index 16b41a45..7da28237 100644 --- a/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_recursive_expansion_blocked.json +++ b/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_recursive_expansion_blocked.json @@ -113,6 +113,29 @@ "amount": 0.0, "input_tokens": 0, "output_tokens": 0 + }, + "trace_explainability": { + "trace_id": "fixture-openviking-recursive-expansion-blocked", + "failure_stage": "openviking.recursive_expansion_gate", + "failure_reason": "Seed, expanded child, final evidence, and pruned-branch artifacts are not materialized, so recursive/context expansion remains blocked.", + "stages": [ + { + "stage_name": "openviking.same_corpus_gate", + "kept_evidence": ["recursive-same-corpus-gate"], + "notes": "Recursive expansion scoring remains gated behind expected evidence id coverage." + }, + { + "stage_name": "openviking.recursive_expansion_gate", + "kept_evidence": ["recursive-expansion-output-contract"], + "dropped_evidence": ["recursive-expansion-win-decoy"], + "notes": "The missing expansion-path artifact must show seed context, expanded child contexts, final evidence ids, and pruned branches." + }, + { + "stage_name": "openviking.comparison_gate", + "kept_evidence": ["recursive-elf-comparison-gate"], + "notes": "No ELF tie, win, or loss is allowed until both systems publish comparable expansion-path artifacts for the same scenario." 
+ } + ] } } } diff --git a/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_staged_retrieval_blocked.json b/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_staged_retrieval_blocked.json index b27fedb6..e5823e48 100644 --- a/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_staged_retrieval_blocked.json +++ b/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_staged_retrieval_blocked.json @@ -112,6 +112,27 @@ "amount": 0.0, "input_tokens": 0, "output_tokens": 0 + }, + "trace_explainability": { + "trace_id": "fixture-openviking-staged-retrieval-blocked", + "failure_stage": "openviking.stage_artifact_gate", + "failure_reason": "Stage-level OpenViking trajectory output is not materialized, so the fixture keeps the context-trajectory comparison blocked.", + "stages": [ + { + "stage_name": "openviking.same_corpus_gate", + "kept_evidence": [ + "openviking-evidence-id-output-contract", + "openviking-same-corpus-precondition-blocked" + ], + "notes": "Same-corpus expected, matched, and missing evidence ids must be correct before stage scoring is allowed." + }, + { + "stage_name": "openviking.stage_artifact_gate", + "kept_evidence": ["elf-comparison-requires-comparable-trajectory"], + "dropped_evidence": ["trajectory-win-decoy"], + "notes": "Comparable stage artifacts are missing, and the decoy ELF win claim is explicitly dropped." 
+ } + ] } } } diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index e76356be..b96fc9c8 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -1847,6 +1847,10 @@ fn graph_rag_representative_fixtures_report_typed_non_pass_states() -> Result<() report.pointer("/summary/temporal_validity_not_encoded_count").and_then(Value::as_u64), Some(1) ); + assert_eq!( + report.pointer("/summary/trace_explainability_count").and_then(Value::as_u64), + Some(1) + ); let jobs = array_at(&report, "/jobs")?; let ragflow = find_by_field(jobs, "/job_id", "graph-rag-ragflow-reference-chunks-001")?; @@ -1872,6 +1876,17 @@ fn graph_rag_representative_fixtures_report_typed_non_pass_states() -> Result<() graphiti.pointer("/evolution/temporal_validity_not_encoded").and_then(Value::as_bool), Some(true) ); + assert_eq!( + graphiti.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), + Some("graphiti.provider_boundary") + ); + assert!(array_contains_str(graphiti, "/produced_evidence", "graphiti-current-fact-contract")?); + assert!(array_contains_str( + graphiti, + "/produced_evidence", + "graphiti-historical-fact-contract" + )?); + assert!(array_contains_str(graphiti, "/produced_evidence", "graphiti-provider-boundary")?); assert!(array_contains_str(graphify, "/produced_evidence", "graphify-source-location-output")?); Ok(()) @@ -3383,7 +3398,8 @@ fn assert_qmd_debug_retest_markdown_and_indexes( benchmarking_index.contains("2026-06-19-qmd-debug-ergonomics-dreaming-retest-report.md") ); assert!(readme.contains("qmd Debug-Ergonomics Dreaming Retest Report - June 19, 2026")); - assert!(readme.contains("Latest real-world benchmark report: June 22, 2026")); + assert!(readme.contains("Temporal and Trajectory Adapter Coverage Report - June 23, 2026")); + assert!(readme.contains("Latest real-world benchmark report: June 23, 2026")); assert!(readme.contains("keeps 
the qmd edge unchanged")); } @@ -7199,6 +7215,10 @@ fn context_trajectory_fixtures_report_blocked_openviking_gates() -> Result<()> { report.pointer("/summary/expected_evidence_recall").and_then(Value::as_f64), Some(1.0) ); + assert_eq!( + report.pointer("/summary/trace_explainability_count").and_then(Value::as_u64), + Some(3) + ); let suites = array_at(&report, "/suites")?; let context = find_by_field(suites, "/suite_id", "context_trajectory")?; @@ -7217,6 +7237,40 @@ fn context_trajectory_fixtures_report_blocked_openviking_gates() -> Result<()> { assert_eq!(staged.pointer("/status").and_then(Value::as_str), Some("blocked")); assert_eq!(hierarchy.pointer("/status").and_then(Value::as_str), Some("blocked")); assert_eq!(recursive.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + staged.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), + Some("openviking.stage_artifact_gate") + ); + assert_eq!( + hierarchy.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), + Some("openviking.hierarchy_artifact_gate") + ); + assert_eq!( + recursive.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), + Some("openviking.recursive_expansion_gate") + ); + + let staged_stages = array_at(staged, "/trace_explainability/stages")?; + let staged_gate = + find_by_field(staged_stages, "/stage_name", "openviking.stage_artifact_gate")?; + + assert!(array_contains_str(staged_gate, "/dropped_evidence", "trajectory-win-decoy")?); + + let hierarchy_stages = array_at(hierarchy, "/trace_explainability/stages")?; + let hierarchy_gate = + find_by_field(hierarchy_stages, "/stage_name", "openviking.hierarchy_artifact_gate")?; + + assert!(array_contains_str(hierarchy_gate, "/dropped_evidence", "hierarchy-design-win-decoy")?); + + let recursive_stages = array_at(recursive, "/trace_explainability/stages")?; + let recursive_gate = + find_by_field(recursive_stages, "/stage_name", "openviking.recursive_expansion_gate")?; + + 
assert!(array_contains_str( + recursive_gate, + "/dropped_evidence", + "recursive-expansion-win-decoy" + )?); assert!( staged.pointer("/reason").and_then(Value::as_str).is_some_and( |reason| reason.contains("same-corpus output returns expected evidence ids") @@ -7292,7 +7346,7 @@ fn assert_root_aggregate_summary(report: &Value) { assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(1.0)); assert_eq!( report.pointer("/summary/trace_explainability_count").and_then(Value::as_u64), - Some(2) + Some(5) ); assert_eq!( report.pointer("/summary/wrong_result_stage_attribution_count").and_then(Value::as_u64), diff --git a/docs/evidence/benchmarking/2026-06-23-temporal-trajectory-adapter-coverage-report.md b/docs/evidence/benchmarking/2026-06-23-temporal-trajectory-adapter-coverage-report.md new file mode 100644 index 00000000..c5f4d441 --- /dev/null +++ b/docs/evidence/benchmarking/2026-06-23-temporal-trajectory-adapter-coverage-report.md @@ -0,0 +1,97 @@ +--- +type: Evidence +title: "Temporal and Trajectory Adapter Coverage Report - June 23, 2026" +description: "Checked-in benchmark evidence record for XY-1070 Graphiti/Zep temporal and OpenViking context-trajectory adapter coverage." 
+resource: docs/evidence/benchmarking/2026-06-23-temporal-trajectory-adapter-coverage-report.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-23 +tags: + - docs + - evidence + - benchmarking +source_refs: [] +code_refs: + - apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphiti_temporal_validity_blocked.json + - apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_staged_retrieval_blocked.json + - apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_hierarchy_selection_blocked.json + - apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_recursive_expansion_blocked.json + - scripts/graphiti-zep-docker-temporal-smoke.py +drift_watch: + - docs/spec/real_world_agent_memory_benchmark_v1.md + - docs/runbook/benchmarking/real_world_agent_memory_benchmark.md +--- +# Temporal and Trajectory Adapter Coverage Report - June 23, 2026 + +Purpose: Record the XY-1070 refresh for Graphiti/Zep temporal-validity and +OpenViking context-trajectory adapter coverage. +Read this when: You need to know whether temporal graph validity or staged context +trajectory evidence changed after the P2 Knowledge Workspace closeout. +Not this document: Broad graph-memory, hosted Zep, or OpenViking parity evidence. +Inputs: Graph/RAG representative fixtures, OpenViking context-trajectory fixtures, +and the Graphiti/Zep Docker temporal smoke materializer. +Outputs: Typed blocker artifacts with current/historical temporal source ids, +stage-level trajectory blockers, and updated benchmark requirements. + +## Judgment + +XY-1070 improves adapter auditability, not competitive status. + +- Graphiti/Zep temporal-validity coverage now includes a checked-in adapter response + for the representative blocked fixture. It names current, historical, and provider + boundary evidence ids, and exposes `graphiti.provider_boundary` as the blocked trace + stage. 
+- The generated Graphiti/Zep smoke manifest now emits a + `temporal_validity_window_mapping` scenario row. A live pass still requires + provider-backed Graphiti search output that maps current and historical facts to + validity windows and source evidence ids. +- OpenViking staged retrieval, hierarchy selection, and recursive/context expansion + remain typed blockers. Each fixture now exposes trace stages for the same-corpus + gate and the missing stage, hierarchy, rejected sibling/decoy, or recursive + expansion artifact. +- No ELF graph-memory, Graphiti/Zep, hosted Zep, or OpenViking parity, win, tie, or + loss claim is created by this refresh. + +ELF graph-lite remains a derived projection over authoritative source evidence. These +adapter artifacts refine the benchmark and recall-planning trace requirements; they do +not replace memory notes, source refs, or Postgres source-of-truth authority. + +## Command Evidence + +| Command | Result | Evidence | +| --- | --- | --- | +| `jq empty ...graphiti_temporal_validity_blocked.json ...openviking_*.json` | `pass` | All patched JSON fixtures parse. | +| `python3 -m py_compile scripts/graphiti-zep-docker-temporal-smoke.py` | `pass` | The generated manifest scenario change is syntactically valid. | +| `cargo run -p elf-eval --bin real_world_job_benchmark -- run --fixtures apps/elf-eval/fixtures/real_world_memory/context_trajectory --out tmp/real-world-memory/context-trajectory/report.json --run-id real-world-memory-context-trajectory --adapter-id fixture_context_trajectory --adapter-name 'ELF context trajectory fixture'` | `pass` | Report has 3 jobs, 0 pass, 0 wrong_result, 3 blocked, trace explainability count 3, and expected evidence recall 9/9. | +| `cargo run -p elf-eval --bin real_world_job_benchmark -- publish --report tmp/real-world-memory/context-trajectory/report.json --out tmp/real-world-memory/context-trajectory/report.md` | `pass` | Markdown report renders the OpenViking trace-stage blockers. 
| +| `cargo run -p elf-eval --bin real_world_job_benchmark -- run --fixtures apps/elf-eval/fixtures/real_world_external_adapters/graph_rag --out tmp/real-world-memory/graph-rag/report.json --run-id real-world-memory-graph-rag --adapter-id fixture_graph_rag_external_adapters --adapter-name 'Graph/RAG representative external-adapter fixtures'` | `pass` | Graphiti/Zep job remains `blocked`, has `temporal_validity_not_encoded = true`, produces all three temporal/provider evidence ids, and reports `graphiti.provider_boundary`. | +| `cargo run -p elf-eval --bin real_world_job_benchmark -- publish --report tmp/real-world-memory/graph-rag/report.json --out tmp/real-world-memory/graph-rag/report.md` | `pass` | Markdown report renders the Graphiti/Zep temporal blocker under Trace Explainability. | + +## Scenario Readback + +| Scenario | Current outcome | Materialized readback | Claim boundary | +| --- | --- | --- | --- | +| Graphiti/Zep temporal validity | `blocked` | Current fact contract, historical fact contract, provider boundary, and `graphiti.provider_boundary` trace stage. | No pass until live Graphiti search maps validity windows and source ids. | +| OpenViking staged retrieval trajectory | `blocked` | Same-corpus gate plus missing stage-artifact gate; decoy ELF win evidence is dropped. | No ELF win, tie, or loss until both systems publish comparable stage artifacts. | +| OpenViking hierarchy selection | `blocked` | Same-corpus gate plus hierarchy-artifact gate; selected node and rejected sibling/decoy evidence is required. | OpenViking hierarchy remains a design reference, not a scored comparison. | +| OpenViking recursive/context expansion | `blocked` | Same-corpus gate, recursive expansion gate, comparison gate, and dropped trace-doc decoy. | No ELF tie, win, or loss until comparable expansion-path artifacts exist. 
| + +## Requirement Refinement + +- Temporal graph validity requires materialized current fact ids, historical fact ids, + validity windows, source ids, and rationale/update evidence, or a typed setup, + runtime, or provider blocker. +- Context trajectory requires stage-level readback for same-corpus coverage, + selected hierarchy nodes, rejected siblings or decoys, expansion paths, pruned + branches, and the comparison gate. +- Recall planning traces should keep blocked, dropped, demoted, distractor, and + not-tested context visible instead of collapsing missing adapter artifacts into a + broad retrieval failure or a false parity claim. + +## Not Claimed + +- No hosted Zep or broad Graphiti/Zep graph-memory quality claim. +- No OpenViking context-trajectory pass, win, tie, or loss. +- No replacement of ELF source authority by graph-lite or external graph output. +- No private-corpus, provider-backed, or large-corpus performance result. diff --git a/docs/evidence/benchmarking/index.md b/docs/evidence/benchmarking/index.md index f38bb93b..83d52cc1 100644 --- a/docs/evidence/benchmarking/index.md +++ b/docs/evidence/benchmarking/index.md @@ -53,3 +53,4 @@ Routes to: Benchmarking evidence concepts under `docs/evidence/benchmarking/`. - `2026-06-22-p2-knowledge-workspace-pageindex-openkb-closeout-report.md`: P2 Knowledge Workspace PageIndex/OpenKB Closeout Report - June 22, 2026; adds `cargo make real-world-memory-p2-knowledge-closeout`, scores the Source Library and Knowledge Workspace fixture slices as pass, preserves PageIndex/OpenKB as `not_tested` reference-only rows, and keeps P3 adapter queueing behind main-thread acceptance. 
- `2026-06-22-pageindex-openkb-same-corpus-adapter-report.md`: PageIndex/OpenKB Same-Corpus Adapter Report - June 22, 2026; adds `cargo make real-world-memory-pageindex-openkb`, emits checked-in same-corpus typed setup blockers for PageIndex and OpenKB, names source ids and required materialized outputs, and preserves no parity, win, tie, or loss claim. - `2026-06-22-mem0-openmemory-letta-memory-history-core-archive-report.md`: mem0/OpenMemory and Letta Memory-History/Core-Archive Adapter Report - June 22, 2026; adds `cargo make real-world-memory-mem0-openmemory-letta`, maps mem0 SDK history/export outputs to source ids, preserves OpenMemory UI/export as a product blocker, preserves Letta core/archive readback as typed blockers, and makes no hosted/product parity claim. +- `2026-06-23-temporal-trajectory-adapter-coverage-report.md`: Temporal and Trajectory Adapter Coverage Report - June 23, 2026; refreshes Graphiti/Zep temporal-validity and OpenViking context-trajectory adapter evidence with trace-stage typed blockers, source ids, and explicit no-parity boundaries. diff --git a/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md b/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md index d2bd84de..50ee9317 100644 --- a/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md @@ -73,7 +73,7 @@ compile knowledge, and state honest uncertainty. | Production ops | Backfill, restore, cold start, resource, and bounded-failure behavior. | Resume interrupted import without duplicate source notes. | | Personalization | Scoped preferences without cross-tenant leakage. | Apply the user's current preference and ignore another project's note. | | Core/archival memory | Always-loaded core memory behavior kept separate from archival note search. | Detect a stale core block and fall back to archival evidence. 
| -| Context trajectory | Staged context trajectory, hierarchy selection, and recursive expansion. | Block OpenViking trajectory scoring until same-corpus evidence ids and comparable stage artifacts exist. | +| Context trajectory | Staged context trajectory, hierarchy selection, rejected sibling/decoy handling, and recursive expansion. | Block OpenViking trajectory scoring until same-corpus evidence ids and comparable stage artifacts exist. | ## External Reference Mapping @@ -185,7 +185,10 @@ including the retrieval-quality slice below. The suite currently encodes: plus archival rationale. - `context_trajectory`: OpenViking staged retrieval, hierarchy selection, and recursive/context expansion jobs encoded as `blocked` until same-corpus expected - evidence ids and comparable stage artifacts are available. + evidence ids and comparable stage artifacts are available. The fixtures expose + stage-level trace readback for the same-corpus gate, missing staged artifact, + selected hierarchy/rejected sibling gate, and recursive expansion/pruned-branch + gate so a blocker is reviewable instead of a prose-only limitation. - `p1_closeout` fixture slice: four jobs across the existing `consolidation`, `memory_evolution`, and `work_resume` suites for Source Library -> Memory Candidate -> approved memory -> recall/debug -> correction/rollback, stale decision @@ -441,6 +444,44 @@ the cases added for current-versus-historical interpretation and temporal stalen The relation temporal-validity fixture is encoded and scores current owner, historical owner, update rationale, and stale-owner trap behavior. +Graphiti/Zep temporal adapter refresh: + +```sh +cargo make smoke-graphiti-zep-docker-temporal +``` + +Default artifacts: + +```text +tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-report.json +tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-report.md +tmp/real-world-memory/graphiti-zep-smoke/summary.json +``` + +The default command emits a typed blocker. 
A live attempt is opt-in: + +```sh +ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal +``` + +The live path can pass only if generated current and historical temporal relation +facts map to validity windows and source evidence ids. Missing provider credentials, +FalkorDB startup failure, Graphiti setup failure, or unmapped validity windows stay +typed as `blocked`, `incomplete`, or `wrong_result`; no hosted Zep or ELF graph-memory +parity claim is allowed from this smoke. + +OpenViking context-trajectory refresh: + +```sh +cargo make real-world-memory-context-trajectory +``` + +The command scores the checked-in staged retrieval, hierarchy selection, and +recursive/context expansion fixtures. Current blocked fixtures include trace-stage +artifacts that name the same-corpus gate, missing stage/hierarchy/recursive artifact, +dropped decoy or rejected sibling evidence, and comparison gate. These trace stages +make the missing artifacts auditable; they do not create an ELF win, tie, or loss. + Current checked-in retrieval-quality increment: ```sh diff --git a/docs/spec/real_world_agent_memory_benchmark_v1.md b/docs/spec/real_world_agent_memory_benchmark_v1.md index 3b48edbf..12b5213f 100644 --- a/docs/spec/real_world_agent_memory_benchmark_v1.md +++ b/docs/spec/real_world_agent_memory_benchmark_v1.md @@ -6,7 +6,7 @@ resource: docs/spec/real_world_agent_memory_benchmark_v1.md status: active authority: normative owner: spec -last_verified: 2026-06-18 +last_verified: 2026-06-23 tags: - docs - spec @@ -463,6 +463,35 @@ Fields: `encoding.status = "not_encoded"` or `encoding.status = "blocked"`. When `encoded = true`, the job is scored normally and must include concrete produced evidence for current and historical validity behavior. 
+ Graphiti/Zep-style temporal jobs MUST NOT set `encoded = true` from a prose + contract alone; they need materialized current fact ids, historical fact ids, + validity windows, source ids, and rationale/update evidence, or an explicit typed + provider/setup blocker. + +### `trace_explainability` + +`corpus.adapter_response.answer.trace_explainability` is optional for most jobs but +SHOULD be present when a fixture's main value is a blocked or wrong-result retrieval +path. It records the stage movement needed to audit the answer without treating hidden +debug notes as evidence. + +Fields: + +- `trace_id`: optional stable fixture or runtime trace handle. +- `failure_stage`: optional stage name that MUST match one of `stages[].stage_name` + when stages are provided. +- `failure_reason`: optional concise reason for the blocked, incomplete, or + wrong-result stage. +- `stages`: ordered stage records. + +Each `stages[]` record MUST include `stage_name` and MAY include +`kept_evidence`, `dropped_evidence`, `demoted_evidence`, `distractor_evidence`, and +`notes`. Evidence ids in stage arrays MUST refer to corpus items. OpenViking-style +context trajectory jobs SHOULD use trace stages to expose the same-corpus gate, staged +retrieval artifact gate, hierarchy selected-node gate, rejected sibling or decoy +handling, recursive expansion paths, and pruned branches. A blocked trajectory fixture +MUST keep the comparison outcome blocked or not tested; trace stages do not create an +ELF win, tie, or loss claim. ### `operator_debug` @@ -566,7 +595,7 @@ Suite ids are stable public names. Each suite MUST contain at least one | `production_ops` | Prove safe operation under backup, restore, backfill, cold start, resource, and credential boundaries. | Resume interrupted import; restore from backup; report missing private manifest as bounded caveat. | Command/report artifacts, resource envelope, checkpoint state, failure guard evidence. 
| lifecycle_behavior, latency_resource, uncertainty_handling, evidence_grounding. | ELF, qmd, memsearch, LangGraph. | | `personalization` | Apply user/project preferences correctly without leaking across scopes or overfitting stale preferences. | Remember preferred response style; avoid using another project tenant's note; update a preference. | Scoped memory ids, preference versions, tenant/project/agent context, negative cross-scope traps. | personalization_fit, trap_avoidance, evidence_grounding, answer_correctness. | mem0, Letta, agentmemory, ELF. | | `core_archival_memory` | Verify always-loaded core memory behavior separately from archival note search and derived retrieval indexes. | Read an attached core block; enforce core block scope; detect stale core state from archival evidence; fall back to archival notes; recover a decision from core routing plus archival rationale. | Core block ids, attachment ids, read_profile/scope metadata, source_ref and audit history, archival note evidence ids, stale-core traps, and explicit no-Qdrant-core-block boundary evidence. | answer_correctness, evidence_grounding, trap_avoidance, lifecycle_behavior, workflow_helpfulness. | Letta, ELF. | -| `context_trajectory` | Measure staged context trajectory, hierarchy selection, and recursive/context expansion without converting setup or retrieval preconditions into trajectory wins. | Explain whether a staged trajectory can be scored; identify selected hierarchy nodes; report recursive expansion paths and pruned branches. | Same-corpus expected evidence ids, matched/missing evidence ids, stage artifacts, selected hierarchy nodes, expansion paths, comparable ELF trace/session artifacts when a comparison is claimed. | answer_correctness, evidence_grounding, trap_avoidance, debuggability, workflow_helpfulness. | OpenViking, ELF, qmd. 
| +| `context_trajectory` | Measure staged context trajectory, hierarchy selection, and recursive/context expansion without converting setup or retrieval preconditions into trajectory wins. | Explain whether a staged trajectory can be scored; identify selected hierarchy nodes; report recursive expansion paths and pruned branches. | Same-corpus expected evidence ids, matched/missing evidence ids, stage artifacts, selected hierarchy nodes, rejected siblings or decoys, expansion paths, pruned branches, comparable ELF trace/session artifacts when a comparison is claimed. | answer_correctness, evidence_grounding, trap_avoidance, debuggability, workflow_helpfulness. | OpenViking, ELF, qmd. | ## Report Semantics @@ -634,7 +663,10 @@ Reports MUST include: Reports that encode `memory_evolution` jobs SHOULD also include stale-answer counts, conflict detection counts, update rationale availability, and temporal-validity `not_encoded` counts. A temporal graph validity job MUST NOT be reported as `pass` -unless the runner can evaluate current-only versus historical relation facts. +unless the runner can evaluate current-only versus historical relation facts with +source ids and validity windows. ELF graph-lite report evidence remains a derived +projection over authoritative sources; external temporal graph adapter evidence MUST +NOT replace ELF source authority. 
Reports that encode `memory_summary` jobs MUST also include: diff --git a/scripts/graphiti-zep-docker-temporal-smoke.py b/scripts/graphiti-zep-docker-temporal-smoke.py index 065bb78c..ab86e731 100644 --- a/scripts/graphiti-zep-docker-temporal-smoke.py +++ b/scripts/graphiti-zep-docker-temporal-smoke.py @@ -1058,6 +1058,22 @@ def write_manifest(status: StatusState) -> dict[str, Any]: "evidence": "The smoke records setup and provider boundaries but does not encode backup, restore, private corpus, or hosted-service operations.", }, ], + "scenarios": [ + { + "scenario_id": "temporal_validity_window_mapping", + "suite_id": "memory_evolution", + "status": status.result, + "elf_position": "untested", + "comparison_outcome": "blocked" + if status.result == "blocked" + else "not_tested", + "evidence": status.failure_reason + if status.failure_reason + else "Graphiti/Zep temporal search mapped generated current and historical relation facts to validity windows and evidence ids.", + "command": "cargo make smoke-graphiti-zep-docker-temporal", + "artifact": rel(OUT), + } + ], "evidence": [ {"kind": "artifact", "ref": rel(OUT), "status": status.result}, {"kind": "manifest", "ref": rel(MANIFEST_OUT), "status": status.overall},