From 8fbd3dbc709e5356cddd1645fa8e7bff2770d48f Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Fri, 19 Jun 2026 14:44:58 +0800 Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Materialize service-native Dreaming readback","authority":"XY-986"} --- Makefile.toml | 9 + README.md | 16 +- ...ive-dreaming-readback-materialization.json | 324 + ...rvice-native-dreaming-readback-report.json | 5231 +++++++++++++++++ .../src/bin/real_world_live_adapter.rs | 607 +- .../tests/real_world_job_benchmark.rs | 183 + ...service-native-dreaming-readback-report.md | 128 + docs/evidence/benchmarking/index.md | 1 + docs/log.md | 4 + scripts/real-world-docker.sh | 6 + scripts/real-world-dreaming-service-native.sh | 88 + 11 files changed, 6549 insertions(+), 48 deletions(-) create mode 100644 apps/elf-eval/fixtures/report_snapshots/2026-06-19-service-native-dreaming-readback-materialization.json create mode 100644 apps/elf-eval/fixtures/report_snapshots/2026-06-19-service-native-dreaming-readback-report.json create mode 100644 docs/evidence/benchmarking/2026-06-19-service-native-dreaming-readback-report.md create mode 100755 scripts/real-world-dreaming-service-native.sh diff --git a/Makefile.toml b/Makefile.toml index 4505b75e..59c8ed47 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -59,6 +59,7 @@ # | real-world-memory-scheduled | composite | | # | real-world-memory-scheduled-json | command | | # | real-world-memory-scheduled-report | command | | +# | real-world-memory-service-native-dreaming | command | | # | real-world-memory-summary | composite | | # | real-world-memory-summary-json | command | | # | real-world-memory-summary-report | command | | @@ -865,6 +866,14 @@ args = [ "tmp/real-world-memory/scheduled/report.md", ] +[tasks.real-world-memory-service-native-dreaming] +workspace = false +command = "bash" +args = [ + "scripts/real-world-docker.sh", + "memory-service-native-dreaming", +] + [tasks.real-world-memory-summary] workspace = false dependencies = [ diff --git 
a/README.md b/README.md index 0c5eb979..7f143161 100644 --- a/README.md +++ b/README.md @@ -199,6 +199,14 @@ provider-backed ELF evidence was required. competitive status unchanged: no ELF-over-Letta win, tie, or loss is allowed until exported Letta core block JSON, archival readback/search JSON, and fixture source ids are present. +- Service-native Dreaming readback after XY-986: the June 19 follow-up adds + `cargo make real-world-memory-service-native-dreaming`, a Docker-contained ELF + service readback command for `memory_summary`, `proactive_brief`, and + `scheduled_memory`. The slice scores 9 pass, 0 wrong_result, and 2 typed XY-930 + private/provider blockers with 22/22 evidence, source-ref, and quote coverage. + This improves local Dreaming runtime authority and auditability, but it does not + prove Pulse, ChatGPT Tasks, Claude Dreams, hosted managed-memory, or private-corpus + parity. - Full-suite live real-world adapter sweep after XY-926: ELF and qmd emit Docker-isolated `live_real_world` records for all 55 checked-in jobs across 13 suites through `cargo make real-world-memory-live-adapters`. 
Both keep the original @@ -309,6 +317,7 @@ Detailed evidence and interpretation: - [Dreaming Competitor-Strength Retest Report - June 17, 2026](docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md) - [qmd Debug-Ergonomics Dreaming Retest Report - June 19, 2026](docs/evidence/benchmarking/2026-06-19-qmd-debug-ergonomics-dreaming-retest-report.md) - [OpenViking Trajectory Materialization Report - June 19, 2026](docs/evidence/benchmarking/2026-06-19-openviking-trajectory-materialization-report.md) +- [Service-Native Dreaming Readback Report - June 19, 2026](docs/evidence/benchmarking/2026-06-19-service-native-dreaming-readback-report.md) - [Live Baseline Benchmark Runbook](docs/runbook/benchmarking/live_baseline_benchmark.md) - [Single-User Production Runbook](docs/runbook/single_user_production.md) - Benchmark contract: @@ -406,9 +415,10 @@ Detailed comparison, mechanism-level analysis, and source map: - [Dreaming Product Surface Follow-Up Research](docs/research/dreaming_product_surface_followup.md) Latest real-world benchmark report: June 19, 2026. Latest external research refresh: -June 11, 2026; June 19 adds the qmd debug-ergonomics Dreaming retest after the June -17 competitor-strength closeout and the June 16 temporal reconciliation, live -consolidation self-check, proactive-brief, and scheduled-memory scoring evidence. +June 11, 2026; June 19 adds service-native Dreaming readback after the qmd +debug-ergonomics Dreaming retest, the June 17 competitor-strength closeout, and the +June 16 temporal reconciliation, live consolidation self-check, proactive-brief, and +scheduled-memory scoring evidence. 
## Documentation diff --git a/apps/elf-eval/fixtures/report_snapshots/2026-06-19-service-native-dreaming-readback-materialization.json b/apps/elf-eval/fixtures/report_snapshots/2026-06-19-service-native-dreaming-readback-materialization.json new file mode 100644 index 00000000..0d5d99ae --- /dev/null +++ b/apps/elf-eval/fixtures/report_snapshots/2026-06-19-service-native-dreaming-readback-materialization.json @@ -0,0 +1,324 @@ +{ + "schema": "elf.real_world_live_adapter_materialization/v1", + "adapter_id": "elf_service_native_dreaming", + "adapter_kind": "elf_service_runtime", + "status": "blocked", + "fixtures": "/workspace/tmp/real-world-memory/service-native-dreaming/input-fixtures", + "generated_fixtures": "/workspace/tmp/real-world-memory/service-native-dreaming/elf-fixtures", + "command_evidence": [ + { + "label": "elf_service_runtime", + "status": "blocked", + "command": "cargo run -p elf-eval --bin real_world_live_adapter -- elf", + "artifact": "/workspace/tmp/real-world-memory/service-native-dreaming/elf-materialization.json", + "reason": "ELF live adapter used ElfService, worker indexing, and search_raw." 
+ } + ], + "jobs": [ + { + "job_id": "memory-summary-source-trace-001", + "suite": "memory_summary", + "title": "Read back a reviewable current memory summary with source trace", + "status": "pass", + "query": "Show the current memory summary surface and explain why stale, tombstoned, and unsupported derived memories are not top-of-mind current facts.", + "evidence_ids": [ + "summary-contract-current", + "xy952-summary-contract", + "summary-ttl-tombstone", + "summary-contract-non-parity-boundary" + ], + "returned_count": 5, + "latency_ms": 51.676775, + "trace_id": "2e80669d-2bcf-4238-b780-9b42aa72d2a2", + "failure": null, + "dreaming_readback": { + "artifact_kind": "elf.memory_summary/v1", + "runtime_path": "ElfService::add_note -> ElfService::list -> derived readback artifact", + "service_list_count": 9, + "trace_id": "2e80669d-2bcf-4238-b780-9b42aa72d2a2", + "generated_artifact_count": 1, + "selected_source_refs": [ + "stale-summary-gap", + "summary-background-sot", + "summary-contract-current", + "summary-contract-non-parity-boundary", + "summary-temporary-claim", + "summary-ttl-tombstone", + "superseded-live-evolution-loss", + "xy952-summary-contract" + ], + "missing_source_refs": [], + "source_mutation_count": 0, + "no_source_mutation_checked": true + } + }, + { + "job_id": "proactive-daily-project-brief-001", + "suite": "proactive_brief", + "title": "Generate a daily project brief from current project memory", + "status": "pass", + "query": "Generate a daily project brief with only source-linked current recommendations.", + "evidence_ids": [ + "daily-current-validation-gate", + "daily-current-ledger-update" + ], + "returned_count": 3, + "latency_ms": 6.884306, + "trace_id": "fc854889-2ac4-436b-a885-b43053922cb9", + "failure": null, + "dreaming_readback": { + "artifact_kind": "elf.proactive_project_brief/v1", + "runtime_path": "ElfService::add_note -> ElfService::list -> derived readback artifact", + "service_list_count": 3, + "trace_id": 
"fc854889-2ac4-436b-a885-b43053922cb9", + "generated_artifact_count": 1, + "selected_source_refs": [ + "daily-current-ledger-update", + "daily-current-validation-gate", + "daily-old-parity-trap" + ], + "missing_source_refs": [], + "source_mutation_count": 0, + "no_source_mutation_checked": true + } + }, + { + "job_id": "proactive-private-corpus-refresh-blocked-001", + "suite": "proactive_brief", + "title": "Block private-corpus refresh suggestions when no operator manifest exists", + "status": "blocked", + "query": "Suggest a private-corpus refresh when private inputs exist.", + "evidence_ids": [], + "returned_count": 0, + "latency_ms": 0.0, + "trace_id": null, + "failure": "No operator-owned private production corpus manifest is available; private-corpus refresh suggestions stay blocked under XY-930." + }, + { + "job_id": "proactive-resume-work-brief-001", + "suite": "proactive_brief", + "title": "Generate a resume-work brief from current handoff memory", + "status": "pass", + "query": "Generate a resume-work brief that identifies the current next action and validation command.", + "evidence_ids": [ + "resume-current-handoff", + "resume-current-validation" + ], + "returned_count": 3, + "latency_ms": 7.336724, + "trace_id": "c77d3ddb-d0c0-4168-a528-a585adfc8a7f", + "failure": null, + "dreaming_readback": { + "artifact_kind": "elf.proactive_project_brief/v1", + "runtime_path": "ElfService::add_note -> ElfService::list -> derived readback artifact", + "service_list_count": 3, + "trace_id": "c77d3ddb-d0c0-4168-a528-a585adfc8a7f", + "generated_artifact_count": 1, + "selected_source_refs": [ + "resume-current-handoff", + "resume-current-validation", + "resume-stale-validation" + ], + "missing_source_refs": [], + "source_mutation_count": 0, + "no_source_mutation_checked": true + } + }, + { + "job_id": "proactive-stale-decision-audit-001", + "suite": "proactive_brief", + "title": "Warn about a stale project decision before suggesting work", + "status": "pass", + "query": 
"Audit stale project decisions before generating proactive suggestions.", + "evidence_ids": [ + "stale-decision-old-gate", + "stale-decision-new-gate" + ], + "returned_count": 2, + "latency_ms": 9.269810999999999, + "trace_id": "d7decd9a-d635-41b5-9dcc-c6e3c5c44fb7", + "failure": null, + "dreaming_readback": { + "artifact_kind": "elf.proactive_project_brief/v1", + "runtime_path": "ElfService::add_note -> ElfService::list -> derived readback artifact", + "service_list_count": 2, + "trace_id": "d7decd9a-d635-41b5-9dcc-c6e3c5c44fb7", + "generated_artifact_count": 1, + "selected_source_refs": [ + "stale-decision-new-gate", + "stale-decision-old-gate" + ], + "missing_source_refs": [], + "source_mutation_count": 0, + "no_source_mutation_checked": true + } + }, + { + "job_id": "proactive-stale-plan-preference-warning-001", + "suite": "proactive_brief", + "title": "Reject stale plan and preference suggestions after TTL invalidation", + "status": "pass", + "query": "Warn me about stale plans or preferences before making proactive suggestions.", + "evidence_ids": [ + "stale-plan-ttl", + "current-preference-concise-brief" + ], + "returned_count": 5, + "latency_ms": 7.991892, + "trace_id": "f2e795b5-7ac4-4f7d-ab49-75392f6ba8a8", + "failure": null, + "dreaming_readback": { + "artifact_kind": "elf.proactive_project_brief/v1", + "runtime_path": "ElfService::add_note -> ElfService::list -> derived readback artifact", + "service_list_count": 5, + "trace_id": "f2e795b5-7ac4-4f7d-ab49-75392f6ba8a8", + "generated_artifact_count": 1, + "selected_source_refs": [ + "current-plan-run-gate", + "current-preference-concise-brief", + "old-preference-long-brief", + "stale-plan-old", + "stale-plan-ttl" + ], + "missing_source_refs": [], + "source_mutation_count": 0, + "no_source_mutation_checked": true + } + }, + { + "job_id": "scheduled-knowledge-page-refresh-suggestion-001", + "suite": "scheduled_memory", + "title": "Suggest a knowledge-page refresh from scheduled memory", + "status": "pass", 
+ "query": "Run the scheduled knowledge-page refresh suggestion task.", + "evidence_ids": [ + "scheduled-knowledge-page-stale-finding", + "scheduled-knowledge-reviewable-refresh" + ], + "returned_count": 3, + "latency_ms": 6.31843, + "trace_id": "df5b34bc-b8bd-427c-a531-7c37ff2444c8", + "failure": null, + "dreaming_readback": { + "artifact_kind": "elf.scheduled_memory_task/v1", + "runtime_path": "ElfService::add_note -> ElfService::list -> derived readback artifact", + "service_list_count": 3, + "trace_id": "df5b34bc-b8bd-427c-a531-7c37ff2444c8", + "generated_artifact_count": 1, + "selected_source_refs": [ + "scheduled-knowledge-page-stale-finding", + "scheduled-knowledge-reviewable-refresh", + "scheduled-knowledge-silent-rewrite-trap" + ], + "missing_source_refs": [], + "source_mutation_count": 0, + "no_source_mutation_checked": true + } + }, + { + "job_id": "scheduled-private-provider-scheduler-blocked-001", + "suite": "scheduled_memory", + "title": "Block private/provider scheduled tasks without operator inputs", + "status": "blocked", + "query": "Run private/provider scheduled memory tasks when operator inputs exist.", + "evidence_ids": [], + "returned_count": 0, + "latency_ms": 0.0, + "trace_id": null, + "failure": "No operator-owned private production corpus manifest, provider credentials, or hosted scheduler configuration is available; private/provider scheduled tasks stay blocked under XY-930." 
+ }, + { + "job_id": "scheduled-stale-decision-audit-001", + "suite": "scheduled_memory", + "title": "Audit a stale project decision during a scheduled task", + "status": "pass", + "query": "Run the scheduled stale decision audit.", + "evidence_ids": [ + "scheduled-old-consolidation-only-decision", + "scheduled-current-direct-suite-decision" + ], + "returned_count": 2, + "latency_ms": 5.7482619999999995, + "trace_id": "3ca5cf35-007e-4c15-9dce-3983a7053e9a", + "failure": null, + "dreaming_readback": { + "artifact_kind": "elf.scheduled_memory_task/v1", + "runtime_path": "ElfService::add_note -> ElfService::list -> derived readback artifact", + "service_list_count": 2, + "trace_id": "3ca5cf35-007e-4c15-9dce-3983a7053e9a", + "generated_artifact_count": 1, + "selected_source_refs": [ + "scheduled-current-direct-suite-decision", + "scheduled-old-consolidation-only-decision" + ], + "missing_source_refs": [], + "source_mutation_count": 0, + "no_source_mutation_checked": true + } + }, + { + "job_id": "scheduled-stale-preference-plan-audit-001", + "suite": "scheduled_memory", + "title": "Audit stale preferences and plans during a scheduled task", + "status": "pass", + "query": "Run the scheduled stale preference and plan audit.", + "evidence_ids": [ + "scheduled-stale-old-plan", + "scheduled-stale-plan-expired", + "scheduled-current-trace-plan", + "scheduled-current-reviewable-preference" + ], + "returned_count": 5, + "latency_ms": 7.603808, + "trace_id": "8e5741df-c5d5-4e82-a32d-dc8606e8b876", + "failure": null, + "dreaming_readback": { + "artifact_kind": "elf.scheduled_memory_task/v1", + "runtime_path": "ElfService::add_note -> ElfService::list -> derived readback artifact", + "service_list_count": 5, + "trace_id": "8e5741df-c5d5-4e82-a32d-dc8606e8b876", + "generated_artifact_count": 1, + "selected_source_refs": [ + "scheduled-current-reviewable-preference", + "scheduled-current-trace-plan", + "scheduled-old-silent-mutation-preference", + "scheduled-stale-old-plan", + 
"scheduled-stale-plan-expired" + ], + "missing_source_refs": [], + "source_mutation_count": 0, + "no_source_mutation_checked": true + } + }, + { + "job_id": "scheduled-weekly-project-status-summary-001", + "suite": "scheduled_memory", + "title": "Run a weekly project status summary from current memory", + "status": "pass", + "query": "Run the weekly project status summary scheduled task.", + "evidence_ids": [ + "scheduled-weekly-current-gate", + "scheduled-weekly-ledger-update" + ], + "returned_count": 3, + "latency_ms": 5.362345, + "trace_id": "12bcc69c-4971-4cd5-9f58-16ae45772e7f", + "failure": null, + "dreaming_readback": { + "artifact_kind": "elf.scheduled_memory_task/v1", + "runtime_path": "ElfService::add_note -> ElfService::list -> derived readback artifact", + "service_list_count": 3, + "trace_id": "12bcc69c-4971-4cd5-9f58-16ae45772e7f", + "generated_artifact_count": 1, + "selected_source_refs": [ + "scheduled-weekly-current-gate", + "scheduled-weekly-hosted-parity-trap", + "scheduled-weekly-ledger-update" + ], + "missing_source_refs": [], + "source_mutation_count": 0, + "no_source_mutation_checked": true + } + } + ] +} \ No newline at end of file diff --git a/apps/elf-eval/fixtures/report_snapshots/2026-06-19-service-native-dreaming-readback-report.json b/apps/elf-eval/fixtures/report_snapshots/2026-06-19-service-native-dreaming-readback-report.json new file mode 100644 index 00000000..6513f53c --- /dev/null +++ b/apps/elf-eval/fixtures/report_snapshots/2026-06-19-service-native-dreaming-readback-report.json @@ -0,0 +1,5231 @@ +{ + "schema": "elf.real_world_job_report/v1", + "run_id": "real-world-memory-service-native-dreaming", + "generated_at": "2026-06-19T06:42:28.482226741Z", + "runner_version": "0.2.0-unknown-aarch64-unknown-linux-gnu", + "corpus_profile": "mixed", + "adapter": { + "adapter_id": "elf_service_native_dreaming", + "name": "ELF service-native Dreaming readback adapter", + "behavior": "service_native_dreaming_readback", + "storage": 
"pass", + "runtime": "pass", + "notes": "Materialized through ElfService add_note/list/search readback for memory_summary, proactive_brief, and scheduled_memory fixtures. Private/provider blockers remain typed non-pass records under XY-930." + }, + "external_adapters": { + "schema": "elf.real_world_external_adapter_report/v1", + "manifest_id": "real-world-memory-project-adapters-2026-06-11-first-generation-continuity-source-store", + "docker_isolation": { + "default": true, + "compose_file": "docker-compose.baseline.yml", + "runner": "scripts/live-baseline-benchmark.sh", + "artifact_dir": "tmp/live-baseline/", + "host_global_installs_required": false, + "notes": [ + "External project runs default to Docker Compose and Docker-managed caches.", + "Real-world job fixture reports and live baseline reports use separate schemas and claim boundaries." + ] + }, + "summary": { + "adapter_count": 23, + "external_project_count": 16, + "docker_default_count": 23, + "host_global_install_required_count": 0, + "fixture_backed_count": 1, + "live_baseline_only_count": 6, + "live_real_world_count": 5, + "research_gate_count": 11, + "overall_status_counts": { + "real": 0, + "mocked": 0, + "unsupported": 0, + "blocked": 7, + "incomplete": 0, + "wrong_result": 6, + "lifecycle_fail": 1, + "pass": 4, + "not_encoded": 5 + }, + "capability_status_counts": { + "real": 8, + "mocked": 1, + "unsupported": 6, + "blocked": 23, + "incomplete": 0, + "wrong_result": 10, + "lifecycle_fail": 0, + "pass": 30, + "not_encoded": 26 + }, + "suite_status_counts": { + "real": 0, + "mocked": 0, + "unsupported": 0, + "blocked": 24, + "incomplete": 0, + "wrong_result": 7, + "lifecycle_fail": 0, + "pass": 27, + "not_encoded": 37 + }, + "scenario_status_counts": { + "real": 0, + "mocked": 0, + "unsupported": 3, + "blocked": 16, + "incomplete": 1, + "wrong_result": 6, + "lifecycle_fail": 1, + "pass": 23, + "not_encoded": 7 + }, + "scenario_position_counts": { + "wins": 10, + "ties": 11, + "loses": 1, + 
"untested": 35 + }, + "scenario_outcome_counts": { + "win": 10, + "tie": 11, + "loss": 1, + "not_tested": 13, + "blocked": 17, + "non_goal": 5 + } + }, + "adapters": [ + { + "adapter_id": "elf_real_world_memory_fixture", + "project": "ELF", + "adapter_kind": "offline_fixture_response", + "evidence_class": "fixture_backed", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "pass", + "evidence": "The checked-in real_world_memory fixtures parse and score through the ELF fixture runner.", + "command": "cargo make real-world-memory", + "artifact": "tmp/real-world-memory/real-world-memory-report.json" + }, + "run": { + "status": "blocked", + "evidence": "The current fixture set reports 60 jobs across 16 suites: 53 pass, 0 incomplete, 7 blocked, 0 wrong_result, 0 not_encoded, and 0 unsupported_claim. The six core_archival_memory jobs pass as ELF fixture evidence, not as live Letta comparison evidence; the one memory_summary job passes as fixture-backed source-trace evidence, not as managed-memory parity evidence; the proactive_brief suite scores 4 passing evidence-linked suggestions plus one blocked private-corpus refresh case tied to XY-930, not Pulse or hosted managed-memory parity; the scheduled_memory suite scores 4 passing scheduled readback tasks plus one blocked private/provider scheduler case tied to XY-930, not hosted scheduler, ChatGPT Tasks, Pulse, or provider-backed private-corpus parity; context_trajectory remains blocked behind OpenViking staged-artifact materialization.", + "command": "cargo make real-world-memory", + "artifact": "tmp/real-world-memory/real-world-memory-report.json" + }, + "result": { + "status": "blocked", + "evidence": "This is fixture-backed ELF scoring, not a live external adapter result.", + "artifact": "tmp/real-world-memory/real-world-memory-report.md" + }, + "capabilities": [ + { + "capability": "real_world_job_fixture_scoring", + "status": "real", + 
"evidence": "The runner scores checked-in real_world_job records with expected evidence, traps, and typed status output." + }, + { + "capability": "live_external_adapter_execution", + "status": "not_encoded", + "evidence": "The ELF fixture response path does not exercise an external memory project runtime." + }, + { + "capability": "docker_isolated_baseline", + "status": "pass", + "evidence": "ELF live baseline runs execute through docker-compose.baseline.yml for retrieval and lifecycle evidence." + } + ], + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "pass", + "evidence": "Checked-in source-of-truth rebuild fixture is encoded and passing." + }, + { + "suite_id": "work_resume", + "status": "pass", + "evidence": "Checked-in work-resume fixtures are encoded and passing." + }, + { + "suite_id": "project_decisions", + "status": "pass", + "evidence": "Checked-in project-decision fixtures cover accepted decisions, reversals, current validation gates, rationale, and bounded caveats." + }, + { + "suite_id": "retrieval", + "status": "pass", + "evidence": "Checked-in retrieval fixtures cover alternate phrasing, distractors, multi-hop routing, current-versus-obsolete selection, and minimal context." + }, + { + "suite_id": "memory_evolution", + "status": "pass", + "evidence": "Checked-in memory-evolution fixtures cover current-versus-historical facts and the relation temporal-validity case is encoded." + }, + { + "suite_id": "consolidation", + "status": "pass", + "evidence": "Proposal-only consolidation fixtures are encoded and passing without source mutation." + }, + { + "suite_id": "memory_summary", + "status": "pass", + "evidence": "The source-trace memory summary fixture is encoded and passing with freshness, rationale, tombstone, and unsupported-claim guards." 
+ }, + { + "suite_id": "proactive_brief", + "status": "blocked", + "evidence": "The proactive brief suite scores 4 passing source-linked suggestions and 1 typed private-corpus refresh blocker tied to XY-930." + }, + { + "suite_id": "scheduled_memory", + "status": "blocked", + "evidence": "The scheduled memory suite scores 4 passing source-linked task readbacks with execution trace coverage and 1 typed private/provider scheduler blocker tied to XY-930." + }, + { + "suite_id": "knowledge_compilation", + "status": "pass", + "evidence": "Knowledge page fixtures are encoded and passing with citation and rebuild metrics." + }, + { + "suite_id": "operator_debugging_ux", + "status": "pass", + "evidence": "Operator-debugging fixtures now expose stage attribution and dropped-candidate evidence without raw SQL." + }, + { + "suite_id": "capture_integration", + "status": "pass", + "evidence": "Four redaction, exclusion, source-id, evidence-binding, and capture-boundary fixtures are encoded and passing." + }, + { + "suite_id": "core_archival_memory", + "status": "pass", + "evidence": "Six fixture jobs score core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search." + }, + { + "suite_id": "production_ops", + "status": "blocked", + "evidence": "Production-ops fixtures encode restore, Qdrant rebuild, backfill resume, resource-envelope interpretation, OpenViking wrong-result classification, plus typed blocked operator boundaries." + }, + { + "suite_id": "personalization", + "status": "pass", + "evidence": "The scoped preference fixture is encoded and passing." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "OpenViking staged retrieval, hierarchy selection, and recursive/context expansion fixtures are encoded as blocked until same-corpus evidence ids and staged artifacts are materialized." 
+ } + ], + "scenarios": [], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_memory/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-memory", + "status": "pass" + } + ], + "notes": [ + "This adapter record exists to keep ELF fixture results separate from live external adapter results.", + "The remaining non-pass ELF fixture states are production-ops operator boundaries plus OpenViking context-trajectory measurement gates.", + "Use elf_live_real_world for service-runtime real_world_job evidence; this fixture-backed record must not imply live-service behavior." + ] + }, + { + "adapter_id": "elf_live_real_world", + "project": "ELF", + "adapter_kind": "docker_service_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "The live adapter task runs inside docker-compose.baseline.yml with Docker-owned Postgres, Qdrant, Cargo, npm, qmd, and cache volumes.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + "run": { + "status": "wrong_result", + "evidence": "ELF materializes 55 real_world_job adapter_response objects through ElfService, worker indexing, search_raw, live capture/write-policy ingestion, live consolidation proposal review, live knowledge-page rebuild/lint, and operator-debug trace metadata before scoring; the full sweep includes typed wrong_result, blocked, and not_encoded job records.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The fresh full live sweep scores 55 jobs across all 13 checked-in suites, including live-scored consolidation, knowledge-page, capture/write-policy, and operator-debug 
suites. This is not a full-suite live pass because memory-evolution, production-ops, core-archival, and context-trajectory gaps remain typed non-pass records.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-report.md" + }, + "capabilities": [ + { + "capability": "real_world_job_adapter", + "status": "pass", + "evidence": "The adapter executes real_world_job prompts after runtime ingestion and writes generated answer artifacts before scoring." + }, + { + "capability": "service_runtime_execution", + "status": "real", + "evidence": "The materializer uses ElfService, Postgres, Qdrant, deterministic providers, worker indexing, and search_raw in Docker." + }, + { + "capability": "targeted_live_pass", + "status": "pass", + "evidence": "The answer-retrieval suites from the original representative slice still pass: work_resume, retrieval, and project_decisions." + }, + { + "capability": "full_suite_live_sweep", + "status": "wrong_result", + "evidence": "The runner now emits per-job and per-suite live records for all 55 checked-in jobs, including the operator-debug fixture tree, but memory_evolution is wrong_result and production/core/context boundaries remain typed non-pass." + }, + { + "capability": "full_suite_live_pass", + "status": "wrong_result", + "evidence": "No full-suite live pass is claimed; generated reports preserve wrong_result, blocked, and not_encoded job outcomes." + }, + { + "capability": "typed_failure_reporting", + "status": "pass", + "evidence": "Adapter setup/runtime limitations are materialized as typed jobs with evidence JSON instead of silent claim upgrades." + } + ], + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "pass", + "evidence": "The live adapter retrieved the restore/Qdrant rebuild proof evidence through the service runtime." 
+ }, + { + "suite_id": "work_resume", + "status": "pass", + "evidence": "The live adapter passed 5/5 work_resume jobs through service-runtime evidence retrieval." + }, + { + "suite_id": "retrieval", + "status": "pass", + "evidence": "The live adapter passed 5/5 retrieval jobs through service-runtime evidence retrieval." + }, + { + "suite_id": "project_decisions", + "status": "pass", + "evidence": "The live adapter passed 5/5 project_decisions jobs through service-runtime evidence retrieval." + }, + { + "suite_id": "memory_evolution", + "status": "wrong_result", + "evidence": "The live adapter passed the delete/TTL case but failed five current-versus-historical conflict jobs because retrieval-backed answers did not provide the required historical conflict evidence links." + }, + { + "suite_id": "consolidation", + "status": "pass", + "evidence": "The live adapter creates consolidation runs, materializes proposal jobs through the worker, preserves source lineage and unsupported-claim flags, and applies/defer/discards proposals through review audit transitions." + }, + { + "suite_id": "knowledge_compilation", + "status": "pass", + "evidence": "The live adapter rebuilds derived knowledge pages through ElfService, searches page sections, lints stale source refs after runtime source updates, and emits citation/backlink/unsupported-section page artifacts." + }, + { + "suite_id": "operator_debugging_ux", + "status": "pass", + "evidence": "The full live sweep includes operator_debugging_ux fixtures and emits trace ids, viewer/admin trace-bundle links, replay commands, dropped-candidate visibility, repair-action clarity, and raw_sql_needed=false." 
+ }, + { + "suite_id": "capture_integration", + "status": "pass", + "evidence": "The live adapter passes 4/4 capture_integration jobs through Docker-local ELF ingestion, including capture-boundary classification, excluded evidence ids, source ids in source_ref, write_policy redaction audit counts, evidence binding, and zero secret leakage." + }, + { + "suite_id": "production_ops", + "status": "blocked", + "evidence": "The live adapter sweep does not run backup/restore, private corpus, provider credential, or backfill operations; existing production-ops credential and private-manifest boundaries remain blocked." + }, + { + "suite_id": "personalization", + "status": "pass", + "evidence": "The live adapter retrieved the scoped preference evidence and passed the personalization job." + }, + { + "suite_id": "core_archival_memory", + "status": "not_encoded", + "evidence": "The full live adapter sweep preserves the core/archival fixture gap as typed not_encoded; this issue does not add live core-block attachment/readback materialization." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "The OpenViking-style context trajectory fixtures remain blocked by live staged-trajectory and recursive-expansion measurement gaps." + } + ], + "scenarios": [ + { + "scenario_id": "live_capture_write_policy", + "suite_id": "capture_integration", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF live capture/write-policy jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage. 
This is an ELF self-check, not a win over external hook systems.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + { + "scenario_id": "live_consolidation_proposal_review", + "suite_id": "consolidation", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF live consolidation jobs now exercise source lineage, unsupported-claim flags, and apply/defer/discard review audit transitions. This is an ELF service self-check, not a broad competitor win.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + { + "scenario_id": "live_knowledge_page_rebuild_lint", + "suite_id": "knowledge_compilation", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF live knowledge jobs now exercise page rebuild, search, stale-source lint, citations, backlinks, and unsupported-section handling. 
This is an ELF service self-check, not a broad knowledge-product win.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + { + "scenario_id": "full_sweep_operator_debug", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "ELF full live sweep now includes the operator-debug fixture tree with hydrated trace ids, trace-bundle replay commands, dropped-candidate visibility, repair guidance, and no raw SQL requirement.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + } + ], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_memory/", + "status": "real" + }, + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-memory-live-adapters", + "status": "pass" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/live-adapters/elf-report.json", + "status": "pass" + } + ], + "notes": [ + "This Docker-isolated live real_world_job record now covers the full encoded fixture corpus, not only the original three-suite representative slice.", + "The record is a full-suite sweep, not a full-suite pass; wrong_result, blocked, and not_encoded states remain visible.", + "This record does not prove private-corpus production quality or provider-backed production operations." 
+ ] + }, + { + "adapter_id": "qmd_live_baseline", + "project": "qmd", + "adapter_kind": "docker_cli_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "pass", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner installs qmd inside the baseline container.", + "command": "ELF_BASELINE_PROJECTS=qmd cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/qmd.log" + }, + "run": { + "status": "pass", + "evidence": "qmd same-corpus retrieval, update, delete, and cold-start checks are encoded in the live baseline runner.", + "command": "ELF_BASELINE_PROJECTS=qmd cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "pass", + "evidence": "This live_baseline_only record is same-corpus evidence only; cite qmd_live_real_world for the full live real-world sweep.", + "artifact": "docs/runbook/benchmarking/live_baseline_benchmark.md" + }, + "capabilities": [ + { + "capability": "same_corpus_retrieval", + "status": "pass", + "evidence": "qmd has an encoded Docker same-corpus retrieval adapter." + }, + { + "capability": "update_delete_cold_start", + "status": "pass", + "evidence": "qmd lifecycle smoke checks are encoded in the live-baseline runner." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "This live_baseline_only record does not execute real_world_job prompts; cite qmd_live_real_world for the full live real-world sweep." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "This live_baseline_only record does not execute real_world_job retrieval prompts; cite qmd_live_real_world for the live retrieval adapter run." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "Live-baseline lifecycle checks exist, but no real_world_job memory_evolution run is encoded." 
+ }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "qmd debug ergonomics are a reference dimension; no operator_debugging_ux fixture is executed against qmd." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + }, + { + "kind": "compose", + "ref": "docker-compose.baseline.yml", + "status": "real" + } + ], + "notes": [ + "This same-corpus record remains separate from qmd_live_real_world, which records real_world_job prompt execution and scoring evidence." + ] + }, + { + "adapter_id": "qmd_live_real_world", + "project": "qmd", + "adapter_kind": "docker_cli_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "The live adapter task clones and installs qmd inside the baseline Docker container when the checkout is absent.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/qmd-materialization.json" + }, + "run": { + "status": "wrong_result", + "evidence": "qmd materializes 55 real_world_job adapter_response objects through collection add, update, embed, and query --json before scoring; the full sweep includes typed wrong_result, blocked, and not_encoded job records, with operator-debug fixtures scored through qmd replay metadata rather than ELF trace hydration.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/qmd-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The fresh full qmd live sweep scores 55 jobs across all 13 checked-in suites, preserving consolidation, knowledge-page, capture, production-ops, core-archival, and context-trajectory gaps as typed non-pass records. 
This is not a full-suite live pass.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/qmd-report.md" + }, + "capabilities": [ + { + "capability": "real_world_job_adapter", + "status": "pass", + "evidence": "qmd executes real_world_job prompts through its local CLI retrieval/query workflow and records generated answer artifacts." + }, + { + "capability": "local_cli_retrieval", + "status": "real", + "evidence": "The adapter uses qmd collection add, update, embed -f, and query --json inside Docker." + }, + { + "capability": "targeted_live_pass", + "status": "pass", + "evidence": "The answer-retrieval suites from the original representative slice still pass: work_resume, retrieval, and project_decisions." + }, + { + "capability": "full_suite_live_sweep", + "status": "wrong_result", + "evidence": "The runner now emits per-job and per-suite live records for all 55 checked-in jobs, including the operator-debug fixture tree, but memory_evolution and operator_debugging_ux are wrong_result while non-qmd product surfaces remain typed not_encoded or blocked." + }, + { + "capability": "full_suite_live_pass", + "status": "wrong_result", + "evidence": "No full-suite live pass is claimed; generated reports preserve wrong_result, blocked, and not_encoded job outcomes." + }, + { + "capability": "typed_failure_reporting", + "status": "pass", + "evidence": "qmd setup/runtime limitations are materialized as typed jobs with command evidence and retry artifacts." + } + ], + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "pass", + "evidence": "qmd retrieved the restore/Qdrant rebuild proof evidence through the local CLI workflow." + }, + { + "suite_id": "work_resume", + "status": "pass", + "evidence": "qmd passed 5/5 work_resume jobs through CLI evidence retrieval." + }, + { + "suite_id": "retrieval", + "status": "pass", + "evidence": "qmd passed 5/5 retrieval jobs through CLI evidence retrieval." 
+ }, + { + "suite_id": "project_decisions", + "status": "pass", + "evidence": "qmd passed 5/5 project_decisions jobs through CLI evidence retrieval." + }, + { + "suite_id": "memory_evolution", + "status": "wrong_result", + "evidence": "qmd failed all six memory-evolution jobs in the fresh June 11 diagnostic, including the delete/TTL tombstone job where qmd retrieved only the current plan and missed the tombstone evidence." + }, + { + "suite_id": "consolidation", + "status": "not_encoded", + "evidence": "The qmd live adapter sweep retrieves evidence-linked answers but does not generate or review consolidation proposals." + }, + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "The qmd live adapter sweep retrieves evidence-linked answers but does not generate derived knowledge pages." + }, + { + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "evidence": "The full qmd live sweep includes operator_debugging_ux fixtures and records replay-command metadata, but it lacks ELF trace hydration, viewer links, and intermediate candidate-drop stages, so the suite remains wrong_result." + }, + { + "suite_id": "capture_integration", + "status": "not_encoded", + "evidence": "The qmd live adapter sweep does not exercise capture integrations or write-policy redaction boundaries; all capture_integration jobs remain typed not_encoded for qmd." + }, + { + "suite_id": "production_ops", + "status": "blocked", + "evidence": "The qmd live adapter sweep does not run backup/restore, private corpus, provider credential, or backfill operations; existing production-ops credential and private-manifest boundaries remain blocked." + }, + { + "suite_id": "personalization", + "status": "pass", + "evidence": "qmd retrieved the scoped preference evidence and passed the personalization job." 
+ }, + { + "suite_id": "core_archival_memory", + "status": "not_encoded", + "evidence": "The qmd live adapter sweep preserves the core/archival fixture gap as typed not_encoded; qmd does not expose ELF core-block attachment/readback materialization." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "The OpenViking-style context trajectory fixtures remain blocked by live staged-trajectory and recursive-expansion measurement gaps." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_memory/", + "status": "real" + }, + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-memory-live-adapters", + "status": "pass" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/live-adapters/qmd-report.json", + "status": "pass" + } + ], + "notes": [ + "This qmd record is real-world job evidence and must not be conflated with the same-corpus qmd_live_baseline record.", + "The record is a full-suite sweep, not a full-suite pass; wrong_result, blocked, and not_encoded states remain visible.", + "This record does not prove broad RAG/graph adapter parity or private-corpus production quality." 
+ ] + }, + { + "adapter_id": "elf_operator_debug_live", + "project": "ELF", + "adapter_kind": "docker_service_operator_debug_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "pass", + "setup": { + "status": "pass", + "evidence": "The narrow operator-debug live task runs inside docker-compose.baseline.yml with Docker-owned Postgres, Qdrant, Cargo, npm, qmd, and cache volumes.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-materialization.json" + }, + "run": { + "status": "pass", + "evidence": "ELF materializes operator_debugging_ux adapter_response objects through ElfService, worker indexing, search_raw trace ids, and generated operator_debug metadata.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-report.json" + }, + "result": { + "status": "pass", + "evidence": "The narrow live slice scores operator-debugging jobs with trace availability, replay command availability, candidate-drop visibility, repair-action clarity, and raw-SQL avoidance separated in job-level evidence.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-report.md" + }, + "capabilities": [ + { + "capability": "operator_debug_real_world_job_adapter", + "status": "pass", + "evidence": "The adapter executes the checked-in operator_debugging_ux jobs through the live service materializer and generated scoring fixtures." + }, + { + "capability": "trace_hydration_metadata", + "status": "pass", + "evidence": "Generated operator_debug records include service trace ids, viewer links, admin trace-bundle URLs, and trace_available=true." 
+ }, + { + "capability": "replay_command_metadata", + "status": "pass", + "evidence": "Generated operator_debug records include admin trace-bundle curl replay commands; no raw SQL path is required." + }, + { + "capability": "candidate_drop_visibility", + "status": "pass", + "evidence": "The operator-debug jobs keep dropped-candidate visibility as explicit job-level evidence instead of relying on direct database inspection." + }, + { + "capability": "openmemory_or_claude_mem_ui_runner", + "status": "not_encoded", + "evidence": "This ELF live slice does not launch OpenMemory or claude-mem UI flows." + } + ], + "suites": [ + { + "suite_id": "operator_debugging_ux", + "status": "pass", + "evidence": "The narrow live operator-debug slice scores trace hydration, stage attribution, candidate-drop visibility, selected-but-not-narrated diagnosis, and repair-action clarity through generated ELF live artifacts." + } + ], + "scenarios": [ + { + "scenario_id": "operator_debug_trace_hydration", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "ELF generated trace_available=true, service trace ids, viewer URLs, and admin trace-bundle replay URLs for the operator-debug jobs; qmd has replay rows but no ELF trace hydration surface.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-report.json" + }, + { + "scenario_id": "operator_debug_replay_command", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF generated admin trace-bundle replay commands; qmd generated local CLI query replay commands. 
These are comparable replay-command availability artifacts, not equivalent UI quality claims.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/summary.json" + }, + { + "scenario_id": "operator_debug_candidate_drop_visibility", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "ELF generated operator_debug candidate-drop visibility from trace and replay-candidate metadata without direct SQL assumptions; qmd keeps only top-k replay rows and lacks intermediate candidate-drop stages.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-materialization.json" + }, + { + "scenario_id": "operator_debug_repair_action_clarity", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF and qmd generated clear repair/replay steps for the narrow operator-debug jobs; OpenMemory UI/export remains blocked, and claude-mem UI repair paths remain blocked until Docker-contained hook/viewer evidence exists.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/summary.json" + }, + { + "scenario_id": "operator_debug_selected_but_not_narrated", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "The new selected-but-not-narrated job scores whether selected trace evidence is available for answer-composition repair without direct database inspection.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-report.json" + } + ], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/", + 
"status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-job-operator-ux-live-adapters", + "status": "pass" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-job/operator-ux-live-adapters/elf-report.json", + "status": "pass" + } + ], + "notes": [ + "This is a narrow operator-debug live slice, not a full-suite live pass.", + "The record does not implement product UI improvements and does not claim broad qmd/OpenMemory/claude-mem superiority." + ] + }, + { + "adapter_id": "qmd_operator_debug_live", + "project": "qmd", + "adapter_kind": "docker_cli_operator_debug_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "The narrow operator-debug live task clones and installs qmd inside the baseline Docker container when the checkout is absent.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-materialization.json" + }, + "run": { + "status": "wrong_result", + "evidence": "qmd materializes operator_debugging_ux adapter_response objects through collection add, update, embed, and query --json, then records local replay-command metadata but no service trace hydration.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The narrow live slice gives qmd explicit replay-command evidence, but operator-debug jobs remain wrong_result where trace availability, trace completeness, or candidate-drop stage visibility is required.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.md" + }, + "capabilities": [ + { + "capability": "operator_debug_real_world_job_adapter", + "status": 
"pass", + "evidence": "The adapter executes the checked-in operator_debugging_ux jobs through qmd local CLI materialization and generated scoring fixtures." + }, + { + "capability": "local_replay_command_metadata", + "status": "pass", + "evidence": "Generated operator_debug records include qmd query replay commands tied to per-job collections." + }, + { + "capability": "trace_hydration_metadata", + "status": "wrong_result", + "evidence": "Generated qmd operator_debug records have trace_available=false and no ELF viewer/admin trace bundle because qmd exposes local replay rows rather than service trace hydration." + }, + { + "capability": "candidate_drop_visibility", + "status": "wrong_result", + "evidence": "qmd top-k replay output is available, but intermediate candidate-drop stages are not exposed in the generated artifact." + }, + { + "capability": "openmemory_or_claude_mem_ui_runner", + "status": "not_encoded", + "evidence": "This qmd live slice does not launch OpenMemory or claude-mem UI flows." + } + ], + "suites": [ + { + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "evidence": "The narrow qmd operator-debug slice scores local replay commands but remains wrong_result for trace hydration and candidate-drop stage visibility." 
+ } + ], + "scenarios": [ + { + "scenario_id": "operator_debug_trace_hydration", + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "qmd generated replay-command metadata but trace_available=false, so ELF wins only this trace-hydration dimension; this is not a broad qmd loss.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + }, + { + "scenario_id": "operator_debug_replay_command", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "qmd generated local CLI query replay commands for the same operator-debugging scenarios; ELF generated admin trace-bundle curl commands.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/summary.json" + }, + { + "scenario_id": "operator_debug_candidate_drop_visibility", + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "qmd generated top-k replay output but not intermediate retrieved-but-dropped stage visibility, so candidate-drop diagnosis remains a qmd wrong_result in this narrow slice.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-materialization.json" + }, + { + "scenario_id": "operator_debug_repair_action_clarity", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "qmd generated clear local replay steps for repair investigation, matching ELF on repair-action clarity while differing on trace hydration.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": 
"tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + }, + { + "scenario_id": "operator_debug_selected_but_not_narrated", + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "qmd can replay top-k rows, but the generated artifact does not expose service trace narration stages for the selected-but-not-narrated diagnosis.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + } + ], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-job-operator-ux-live-adapters", + "status": "wrong_result" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json", + "status": "wrong_result" + } + ], + "notes": [ + "This is a narrow operator-debug live slice, not a full-suite live pass.", + "qmd's replay-command availability remains useful; the wrong_result status is limited to trace hydration and candidate-drop stage visibility." 
+ ] + }, + { + "adapter_id": "agentmemory_live_baseline", + "project": "agentmemory", + "adapter_kind": "docker_sdk_mock_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "lifecycle_fail", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner installs and exercises agentmemory package APIs.", + "command": "ELF_BASELINE_PROJECTS=agentmemory cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/agentmemory.log" + }, + "run": { + "status": "lifecycle_fail", + "evidence": "Same-corpus retrieval can run, but durable lifecycle behavior is not proven because the adapter uses an in-memory SDK/KV mock.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "lifecycle_fail", + "evidence": "agentmemory remains a reference for capture and continuity UX, but current Docker evidence is not a durable lifecycle pass.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "capabilities": [ + { + "capability": "same_corpus_retrieval", + "status": "pass", + "evidence": "The current adapter can run mem::remember and mem::search against the shared corpus." + }, + { + "capability": "adapter_storage", + "status": "mocked", + "evidence": "The current adapter uses a process-local StateKV Map and in-memory index." + }, + { + "capability": "durable_cold_start", + "status": "blocked", + "evidence": "A persistent upstream KV/index path or hosted runtime is needed before cold-start recovery can be fairly scored." + }, + { + "capability": "durable_work_resume_capture_path", + "status": "blocked", + "evidence": "XY-925 selects the next local path as a Docker-contained agentmemory session directory with persisted SDK KV store, observation log, and searchable index across a fresh process; the current StateKV Map and in-memory index still block scoring." 
+ }, + { + "capability": "write_policy_hook_capture", + "status": "blocked", + "evidence": "Capture/write-policy jobs require live agentmemory hook observations plus persisted write-policy audit evidence. The current adapter does not execute those hooks." + }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "XY-925 adds fixture-backed blocked prompt coverage for the required durable path, but no live agentmemory real_world_job adapter executes prompts until the persistent local store exists." + } + ], + "suites": [ + { + "suite_id": "work_resume", + "status": "blocked", + "evidence": "A durable upstream agentmemory session/capture path is required before work-resume jobs can be compared fairly." + }, + { + "suite_id": "capture_integration", + "status": "blocked", + "evidence": "The current fixture import boundary is offline and does not run live agentmemory hooks." + }, + { + "suite_id": "memory_evolution", + "status": "blocked", + "evidence": "Durable update/supersede/delete history is not proven by the in-memory adapter." + } + ], + "scenarios": [ + { + "scenario_id": "basic_same_corpus_retrieval", + "suite_id": "retrieval", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports agentmemory retrieval_pass with 3/3 same-corpus retrieval checks through mem::remember and mem::search. 
This is live-baseline-only evidence through an in-memory mock, not a real_world_job suite pass.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "durable_update_reload_lifecycle", + "suite_id": "memory_evolution", + "status": "lifecycle_fail", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports ELF passing 8/8 local lifecycle checks, while agentmemory update_replaces_note_text is lifecycle_fail and cold_start_recovery_search is blocked because the harness uses an in-memory SDK/KV mock. This is an ELF baseline win only at the local lifecycle-smoke evidence class.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "work_resume_capture_continuity", + "suite_id": "work_resume", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "agentmemory's relevant strength is durable coding-agent continuity and capture, but the Docker harness has not proven a persistent session/capture path. 
XY-925 selects the durable local path as a Docker-contained session directory that persists the SDK KV store and searchable index across a fresh process; keep work_resume and capture claims blocked until that path exists.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "tmp/real-world-memory/first-generation-oss/report.json" + }, + { + "scenario_id": "durable_work_resume_local_path", + "suite_id": "work_resume", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "The selected comparable path is explicit: capture into a Docker-local agentmemory session directory, persist the SDK KV/index and observation log, restart a fresh process, then score work_resume prompts. The checked-in fixture records this as blocked rather than scoring the current mock.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json" + }, + { + "scenario_id": "capture_write_policy_hooks", + "suite_id": "capture_integration", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "agentmemory capture/write-policy comparison needs live hook observations and write-policy audit evidence persisted through the selected local store. 
The fixture preserves this as a typed blocker and does not convert the mem::remember smoke into capture proof.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json" + } + ], + "evidence": [ + { + "kind": "evidence", + "ref": "docs/evidence/external_memory/agentmemory_adapter.md", + "status": "real" + }, + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "mocked" + } + ], + "notes": [ + "The offline agentmemory fixture adapter is an import/comparison boundary and must not be treated as live benchmark proof." + ], + "follow_up": { + "title": "[ELF benchmark P0] Make agentmemory adapter lifecycle-durable and fail-typed", + "reason": "A durable upstream agentmemory storage path is required before lifecycle and real-world job suites can be fairly scored." + } + }, + { + "adapter_id": "mem0_openmemory_live_baseline", + "project": "mem0/OpenMemory", + "adapter_kind": "docker_sdk_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "pass", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner can install mem0 and configure local FastEmbed/Qdrant paths.", + "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/mem0.log" + }, + "run": { + "status": "pass", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 exercises local OSS mem0 with FastEmbed, Qdrant path storage, Memory.update, Memory.delete, Memory.history, Memory.get_all, entity filters, and cold-start reload; mem0 passed 8/8 encoded SDK checks. 
XY-931 adds a separate OpenMemory export-helper setup probe artifact and keeps that blocked UI/export result out of the SDK check summary.", + "command": "cargo make openmemory-ui-export-readback", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "pass", + "evidence": "The local OSS mem0 baseline now passes same-corpus retrieval, update/delete/reload, preference correction history, entity-scoped personalization, local get_all export-style readback, and deletion audit history. The separate OpenMemory export-helper setup probe is blocked because Docker is unavailable inside the baseline-runner container before any product app database readback can run. It still does not claim hosted Platform export, optional graph memory, or a real_world_job prompt adapter.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "capabilities": [ + { + "capability": "local_storage", + "status": "real", + "evidence": "The adapter targets local FastEmbed, Qdrant path storage, and local history DB paths in Docker." + }, + { + "capability": "same_corpus_retrieval", + "status": "pass", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 retrieval_pass with 3/3 same-corpus retrieval checks." + }, + { + "capability": "local_lifecycle_update_delete_reload", + "status": "pass", + "evidence": "The Docker runner exercises public Memory.update, Memory.delete, and a new Memory.from_config over the same local Qdrant/history paths; the fresh scoped run reports those lifecycle checks passing." + }, + { + "capability": "preference_correction_history", + "status": "pass", + "evidence": "The fresh scoped run reports preference_correction_history as pass: Memory.history preserved explicit ADD and UPDATE records with old and current preference text, and search returned only the current correction." 
+ }, + { + "capability": "entity_scoped_personalization", + "status": "pass", + "evidence": "The fresh scoped run reports entity_scoped_personalization as pass: user_id, agent_id, and run_id filters returned the ELF scoped preference and omitted a PubFi scoped preference." + }, + { + "capability": "local_get_all_export_readback", + "status": "pass", + "evidence": "The fresh scoped run reports local_get_all_export_readback as pass: Memory.get_all returned the current scoped preference and omitted the other scope." + }, + { + "capability": "deletion_audit_history", + "status": "pass", + "evidence": "The fresh scoped run reports delete_history_audit_readback as pass: Memory.history exposed a DELETE event and search suppressed the deleted memory." + }, + { + "capability": "openmemory_ui_readback", + "status": "blocked", + "evidence": "XY-931 runs a bounded OpenMemory export-helper setup probe after the mem0 SDK corpus checks. The probe finds the OpenMemory tree, UI package, compose file, and export helper, then records a setup blocker because the export helper requires Docker access to a running OpenMemory container. Local SDK get_all readback is measured separately and must not be reused as UI evidence." + }, + { + "capability": "hosted_managed_memory_claims", + "status": "unsupported", + "evidence": "Hosted mem0 Platform behavior and Platform UI export are outside the local OSS Docker adapter and are non-goals for this local evidence record." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No mem0/OpenMemory adapter currently executes real_world_job prompts and answer scoring." + }, + { + "capability": "optional_graph_memory", + "status": "not_encoded", + "evidence": "Optional graph memory is not enabled in the default local OSS path and remains an opt-in scenario gate rather than a default pass/fail claim." 
+ } + ], + "suites": [ + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "Scenario-level local OSS checks now measure preference correction history and deletion audit readback, but no mem0 real_world_job memory_evolution prompt adapter is encoded." + }, + { + "suite_id": "personalization", + "status": "not_encoded", + "evidence": "Scenario-level local OSS checks now measure entity-scoped personalization, but no mem0 real_world_job personalization prompt adapter is encoded." + }, + { + "suite_id": "operator_debugging_ux", + "status": "blocked", + "evidence": "Local SDK get_all inspection is measured, but OpenMemory UI/export readback is blocked by the XY-931 export-helper setup probe until a dedicated OpenMemory compose/import path can load the same corpus into the OpenMemory app database." + } + ], + "scenarios": [ + { + "scenario_id": "basic_local_lifecycle", + "suite_id": "memory_evolution", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "Prior comparable baseline run live-baseline-20260611061612 reports ELF passing 8/8 local lifecycle checks and mem0 passing basic same-corpus retrieval, update, delete, and cold-start reload checks. This remains a basic local lifecycle tie at the encoded smoke surface and is not reused as history/UI evidence.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "preference_correction_history", + "suite_id": "personalization", + "status": "pass", + "elf_position": "loses", + "comparison_outcome": "loss", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 preference_correction_history as pass. 
ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF live memory-evolution preference as wrong_result. The current measured comparison is therefore an ELF loss on this history dimension until ELF temporal reconciliation is fixed.", + "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", + "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" + }, + { + "scenario_id": "entity_scoped_personalization", + "suite_id": "personalization", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 entity_scoped_personalization as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md, which records ELF and qmd passing the encoded personalization slice. This is a measured tie on the current scoped-preference surface.", + "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", + "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md" + }, + { + "scenario_id": "delete_audit_readback", + "suite_id": "memory_evolution", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 delete_history_audit_readback as pass. 
ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF passing the delete/TTL tombstone job. The current measured delete-audit comparison is a tie.", + "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", + "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" + }, + { + "scenario_id": "local_get_all_export_readback", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 local_get_all_export_readback as pass. This is local SDK inspection/export-style readback, not OpenMemory UI evidence; ELF has no directly comparable live UI/export scoring row in this run.", + "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/mem0-checks.json" + }, + { + "scenario_id": "openmemory_ui_export_readback", + "suite_id": "operator_debugging_ux", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "The XY-931 OpenMemory export-helper setup probe is Docker-contained in the mem0 baseline run. It detects the OpenMemory product tree, UI package, compose file, and export helper, but Docker is unavailable inside the baseline-runner container before the helper can reach a running OpenMemory product container or app database. 
Basic lifecycle and local SDK get_all readback are not reused as UI/export proof.", + "command": "cargo make openmemory-ui-export-readback", + "artifact": "tmp/live-baseline/mem0-openmemory-ui-export.json" + }, + { + "scenario_id": "hosted_platform_export", + "suite_id": "operator_debugging_ux", + "status": "unsupported", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "Hosted mem0 Platform export is explicitly outside the local OSS Docker comparison and is not counted as a local pass, loss, or blocker.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + }, + { + "scenario_id": "optional_graph_memory", + "suite_id": "memory_evolution", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "Optional graph memory is kept as an opt-in scenario gate. It is not enabled in the default mem0 local OSS run and is not part of the default pass/fail comparison.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + } + ], + "notes": [ + "Separate local OSS mem0 SDK evidence from OpenMemory product UI/export claims.", + "A blocked OpenMemory export-helper setup probe is not an ELF win or loss until the product app can import and export the same local corpus." 
+ ] + }, + { + "adapter_id": "memsearch_live_baseline", + "project": "memsearch", + "adapter_kind": "docker_cli_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "pass", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner can install memsearch and run its CLI path.", + "command": "ELF_BASELINE_PROJECTS=memsearch cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/memsearch.log" + }, + "run": { + "status": "pass", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 indexes a per-adapter corpus copy, rewrites and deletes files, reruns memsearch index, and reports memsearch 4/4 encoded checks passing.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "pass", + "evidence": "memsearch now passes the local same-corpus/reindex/update/delete/reload smoke. No real_world_job memsearch prompt adapter is encoded, so Markdown-first behavior remains baseline scenario evidence rather than suite pass evidence.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "capabilities": [ + { + "capability": "canonical_markdown_store", + "status": "real", + "evidence": "memsearch is tracked as a Markdown-first source-of-truth reference." + }, + { + "capability": "same_corpus_retrieval", + "status": "pass", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports memsearch retrieval_pass with 3/3 same-corpus retrieval checks." + }, + { + "capability": "reindex_update_delete_reload", + "status": "pass", + "evidence": "The runner rewrites auth-memory.md, deletes a second corpus file, reruns memsearch index, and starts fresh memsearch search processes; the fresh scoped run reports update, delete, and cold-start reload passing." 
+ }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "XY-925 adds fixture-backed prompt coverage for the Markdown source-store and retrieval-debug jobs, but no live memsearch runtime adapter executes real_world_job prompts and answer scoring." + }, + { + "capability": "markdown_source_store_prompt_jobs", + "status": "pass", + "evidence": "The first-generation OSS fixture slice encodes source-of-truth rebuild/reload and retrieval-debug prompts over the canonical Markdown store while preserving the live-baseline-only evidence boundary." + } + ], + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "not_encoded", + "evidence": "The Markdown-first source model passed the local reindex/reload smoke, and XY-925 adds fixture-backed source-of-truth prompt coverage over the canonical Markdown store. No live memsearch runtime adapter executes prompt scoring yet, so this is not a suite pass." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "The Docker same-corpus check passes, and XY-925 adds fixture-backed retrieval-debug prompt coverage over memsearch CLI replay and Markdown source inspection. No live memsearch runtime adapter executes retrieval prompt scoring yet, so this is not a suite pass." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "Update/delete reindex semantics pass in Docker, but memory_evolution real_world_job prompts are not encoded for memsearch." + } + ], + "scenarios": [ + { + "scenario_id": "canonical_markdown_reindex_reload", + "suite_id": "trust_source_of_truth", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports memsearch passed same-corpus retrieval, update reindex, delete suppression, and cold-start reload over a canonical Markdown corpus. 
ELF has no directly comparable canonical Markdown source-store scenario in this baseline, so the ELF position remains untested.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "markdown_source_store_rebuild_reload_prompt", + "suite_id": "trust_source_of_truth", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-925 adds a checked-in real_world_job prompt fixture that asks for the memsearch source-of-truth path and rebuild/reload boundary: canonical Markdown files are authoritative, while the index is derived by rerunning memsearch index. This is fixture-backed scenario coverage plus baseline artifact evidence, not a memsearch live real_world_job suite pass.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_markdown_rebuild_reload.json" + }, + { + "scenario_id": "markdown_retrieval_debug_prompt", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-925 adds a checked-in retrieval-debug prompt over memsearch's canonical Markdown store. 
The expected debug surface is CLI replay plus Markdown source inspection and reindexing; staged expansion/fusion/rerank/candidate-drop trace bundles remain not encoded for memsearch.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_retrieval_debug_prompt.json" + }, + { + "scenario_id": "ttl_expiry_lifecycle", + "suite_id": "memory_evolution", + "status": "unsupported", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "The encoded memsearch CLI path supports reindex/delete but no TTL or expiry behavior. Unsupported TTL behavior is preserved as unsupported competitor evidence and does not create an ELF win/loss claim without a directly comparable scenario artifact.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "real_world_prompt_adapter", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "No live memsearch runtime adapter currently executes real_world_job prompts and answer scoring. XY-925 fixture-backed prompt jobs document the source-store and retrieval-debug shape, while baseline retrieval/reindex evidence remains separate from suite pass claims.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + } + ], + "notes": [ + "Do not mark memsearch worse solely because setup or local indexing is heavier; preserve the typed incomplete/wrong-result boundary." 
+ ] + }, + { + "adapter_id": "openviking_live_baseline", + "project": "OpenViking", + "adapter_kind": "docker_local_embed_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "OpenViking local-embed setup installed and imported pinned llama-cpp-python==0.3.28 from the CPU wheel index in Docker.", + "command": "ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/OpenViking.log" + }, + "run": { + "status": "wrong_result", + "evidence": "The adapter reached same-corpus add_resource/find and now exposes expected/matched/missing evidence ids, but returned 0 of 3 expected evidence-term matches in the smoke run.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The current OpenViking Docker evidence is a behavioral wrong_result, not a local embedding setup blocker and not a real_world_job pass.", + "artifact": "docs/runbook/benchmarking/live_baseline_benchmark.md" + }, + "capabilities": [ + { + "capability": "local_embed_setup", + "status": "pass", + "evidence": "Docker local embedding dependency setup is pinned to llama-cpp-python==0.3.28 from https://abetlen.github.io/llama-cpp-python/whl/cpu and reached import/runtime in the smoke run." + }, + { + "capability": "same_corpus_retrieval", + "status": "wrong_result", + "evidence": "OpenViking add_resource/find returned resources but missed expected evidence-term matches for every smoke query." + }, + { + "capability": "context_trajectory", + "status": "blocked", + "evidence": "OpenViking staged/hierarchical retrieval is now encoded as blocked context_trajectory fixtures until same-corpus expected evidence ids match and staged artifacts are materialized." 
+ }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No OpenViking adapter currently executes real_world_job prompts and answer scoring." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "wrong_result", + "evidence": "The Docker-local setup reached add_resource/find, but the retrieval check returned 0/3 expected evidence-term matches." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Hierarchical context resume scenarios are not encoded for OpenViking." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "The staged retrieval, hierarchy selection, and recursive/context expansion fixtures are encoded as blocked behind same-corpus evidence output and staged artifact readback." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "wrong_result" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "OpenViking repository", + "url": "https://github.com/volcengine/OpenViking/", + "evidence": "Official source for OpenViking local context database, resource, and retrieval APIs." + }, + { + "label": "llama-cpp-python CPU wheel index", + "url": "https://abetlen.github.io/llama-cpp-python/whl/cpu", + "evidence": "Official prebuilt CPU wheel index used by the Docker-local embedding pin." + } + ], + "setup_path": "Run ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker. 
The runner installs llama-cpp-python==0.3.28 with --only-binary llama-cpp-python from the CPU wheel index before OpenViking add_resource/find.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner container; no host-global OpenViking, llama-cpp-python, or model service install is required.", + "resource_expectation": "Local embedding setup may download a CPU wheel and model assets; record OpenViking.log, elapsed time, and cache size before claiming adapter quality.", + "retry_guidance": [ + "Use the default pinned CPU wheel path first.", + "Override ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_VERSION or ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_INDEX only when the default wheel is unavailable for the Docker platform.", + "Treat install/import failure as incomplete, not wrong_result; treat add_resource/find evidence misses as wrong_result." + ] + }, + "notes": [ + "Record OpenViking as wrong_result now that the pinned Docker local embedding path reaches add_resource/find but misses expected evidence; keep context_trajectory as blocked until staged artifacts exist." + ], + "follow_up": { + "title": "Fix OpenViking evidence-bearing same-corpus retrieval output and materialize staged artifacts", + "reason": "The current adapter reaches add_resource/find and exposes expected evidence ids, but must match evidence ids and return stage/hierarchy/recursive artifacts before trajectory quality can be scored." 
+ } + }, + { + "adapter_id": "claude_mem_live_baseline", + "project": "claude-mem", + "adapter_kind": "docker_repository_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner can install and build claude-mem.", + "command": "ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/claude-mem.log" + }, + "run": { + "status": "wrong_result", + "evidence": "The Docker runner now uses a durable SQLite file, exercises repository update/delete/reopen checks, and reports missed same-corpus or lifecycle evidence as typed non-pass.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "No real_world_job claude-mem adapter is encoded; progressive disclosure remains a design reference.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "capabilities": [ + { + "capability": "same_corpus_retrieval", + "status": "wrong_result", + "evidence": "The current Docker adapter did not prove correct same-corpus retrieval." + }, + { + "capability": "durable_storage", + "status": "real", + "evidence": "The runner writes to a Docker-local SQLite file and constructs a new Database plus repository instances for cold-start recovery search." + }, + { + "capability": "repository_lifecycle", + "status": "real", + "evidence": "The runner uses MemoryItemsRepository.update, deletes from the repository-owned memory_items table, and relies on repository FTS triggers for update/delete checks." + }, + { + "capability": "repository_progressive_disclosure", + "status": "real", + "evidence": "The runner verifies search result to getById detail hydration and listSources source evidence on the durable repository path." 
+ }, + { + "capability": "progressive_disclosure_real_world_job", + "status": "pass", + "evidence": "XY-925 adds fixture-backed prompt coverage for the Docker-contained repository progressive-disclosure path: search result to getById detail hydration and listSources evidence on durable SQLite. Hook, timeline, and viewer workflows remain blocked separately." + }, + { + "capability": "retrieval_repair_artifact", + "status": "wrong_result", + "evidence": "The same-corpus retrieval smoke remains wrong_result, and XY-925 records a repair prompt that tells operators to rerun ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker before inspecting tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json." + }, + { + "capability": "hook_capture_viewer_workflow", + "status": "blocked", + "evidence": "The current Docker runner does not launch claude-mem hooks, timeline capture, local viewer readback, or an operator workflow over the same corpus." + } + ], + "suites": [ + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "The durable repository run is encoded, but hook-driven capture and real_world_job work-resume prompts are not proven by that local repository check." + }, + { + "suite_id": "operator_debugging_ux", + "status": "blocked", + "evidence": "XY-925 adds fixture-backed progressive-disclosure and retrieval-repair prompt coverage, but local viewer/operator workflow remains blocked until a Docker-contained viewer or equivalent readback runner exists." + }, + { + "suite_id": "capture_integration", + "status": "blocked", + "evidence": "claude-mem hook capture remains blocked because hooks, timeline capture, and observation workflows are not executed by this runner." 
+ } + ], + "scenarios": [ + { + "scenario_id": "same_corpus_retrieval", + "suite_id": "retrieval", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports ELF retrieval_pass and claude-mem same_corpus_retrieval as wrong_result with 0/3 expected query checks passing, while its durable repository setup completed. This is an ELF baseline win for the narrow retrieval smoke scenario.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "retrieval_repair_artifact_path", + "suite_id": "retrieval", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "XY-925 adds a checked-in repair prompt that preserves the claude-mem wrong_result and names rerun/inspection targets from the reproducible Docker baseline: tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json. This is repair evidence for a miss, not a retrieval pass.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_retrieval_repair.json" + }, + { + "scenario_id": "repository_lifecycle_reload", + "suite_id": "memory_evolution", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports ELF passing local lifecycle checks and claude-mem update, delete, and cold-start reload checks passing over a durable Docker-local SQLite repository. 
This is a local lifecycle-smoke tie, not a hook-driven work-resume or full progressive-disclosure job pass.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "progressive_disclosure_detail_hydration", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "claude-mem passed the repository-level search-to-detail/source hydration check, which is a useful progressive-disclosure signal. ELF does not have a directly comparable claude-mem-style progressive-disclosure scenario in this baseline, so the ELF position remains untested rather than a loss claim.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "progressive_disclosure_prompt", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-925 adds fixture-backed prompt coverage that asks for the measured claude-mem progressive-disclosure boundary: repository search results hydrate through getById and listSources on durable SQLite, but hooks, timeline, viewer, and live prompt scoring are not executed.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_progressive_disclosure.json" + }, + { + "scenario_id": "hook_capture_viewer_workflow", + "suite_id": "capture_integration", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "The Docker baseline uses repository classes only. 
claude-mem hooks, viewer, timeline, and observation workflows are not executed by the runner, so XY-925 preserves this as a typed blocker rather than not_encoded prose.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json" + }, + { + "scenario_id": "viewer_operator_workflow", + "suite_id": "operator_debugging_ux", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "A fair claude-mem viewer/operator comparison needs a Docker-contained run that opens the local viewer or equivalent readback over the same durable SQLite corpus and emits timeline, detail hydration, and repair-command artifacts. That path is not available in the current runner.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json" + } + ], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + } + ], + "notes": [ + "claude-mem remains a UX reference; durable repository checks do not prove hook, viewer, or full real-world progressive-disclosure behavior." 
+ ] + }, + { + "adapter_id": "qmd_deep_profile_gate", + "project": "qmd", + "adapter_kind": "docker_cli_deep_profile_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "pass", + "evidence": "qmd already has a Docker CLI live-baseline adapter; this gate records the deeper profile extension before a separate scaled run is claimed.", + "command": "ELF_BASELINE_PROJECTS=qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/qmd.log" + }, + "run": { + "status": "not_encoded", + "evidence": "The XY-899 strength-profile report is checked in, but no new live qmd deep-profile adapter artifact is claimed from it." + }, + "result": { + "status": "not_encoded", + "evidence": "The XY-899 report records qmd scenario-level retrieval/debug/replay outcomes and wrong-result diagnosis taxonomy, while expansion/fusion/rerank scoring remains not_encoded.", + "artifact": "docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md" + }, + "capabilities": [ + { + "capability": "stress_profile_retrieval_debug", + "status": "not_encoded", + "evidence": "The stress command path exists, but this adapter-pack gate has not published a deep qmd profile result." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "The qmd live real-world sweep covers the current encoded fixture corpus; expanded retrieval-debug strength suites still need their own materialized adapter run." + }, + { + "capability": "host_global_install_boundary", + "status": "unsupported", + "evidence": "Repository-supported qmd benchmark runs must stay inside docker-compose.baseline.yml and must not require host-global installs." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "A deeper stress retrieval-debug report is not checked in for this gate." 
+ }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "qmd query planning and score readback are not yet scored as operator-debugging real_world_job outputs." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/tobi/qmd", + "status": "real" + }, + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "qmd repository", + "url": "https://github.com/tobi/qmd", + "evidence": "Official qmd source for local hybrid search, CLI setup, and query behavior." + } + ], + "setup_path": "Use the existing Docker baseline qmd install, collection add, update, embed, and query flow with scale or stress profiles.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner container with project files and caches inside Docker volumes.", + "resource_expectation": "CPU local embedding and rerank cost scale with corpus size; record elapsed time and qmd log artifacts before claims.", + "retry_guidance": [ + "Run qmd stress profile in Docker and publish the artifact path.", + "Map qmd JSON output to retrieval-debug real_world_job scoring before suite claims." + ], + "research_depth": "D2 reviewed; deep profile not encoded" + }, + "notes": [ + "This gate deepens qmd planning without changing the existing qmd pass evidence from the smoke live baseline." 
+ ] + }, + { + "adapter_id": "openviking_deep_profile_gate", + "project": "OpenViking", + "adapter_kind": "docker_local_embed_context_trajectory_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "pass", + "evidence": "The default pinned OpenViking local embedding dependency path reaches runtime in Docker.", + "command": "ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/OpenViking.log" + }, + "run": { + "status": "blocked", + "evidence": "The XY-928 context_trajectory fixtures encode staged retrieval, hierarchy selection, and recursive/context expansion as blocked; no live trajectory adapter artifact is claimed." + }, + "result": { + "status": "blocked", + "evidence": "No OpenViking deep context-trajectory result is claimed from the current wrong-result smoke run; the XY-928 fixtures preserve trajectory surfaces as blocked/not_tested.", + "artifact": "docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md" + }, + "capabilities": [ + { + "capability": "docker_local_embed_setup", + "status": "pass", + "evidence": "The local embedding setup is pinned and reaches import/runtime in Docker." + }, + { + "capability": "hierarchical_context_trajectory", + "status": "blocked", + "evidence": "Stage trajectory scoring is encoded as blocked until the smoke adapter returns evidence-bearing same-corpus output and selected hierarchy/expansion artifacts." + }, + { + "capability": "host_global_install_boundary", + "status": "unsupported", + "evidence": "The adapter pack must not ask operators to install OpenViking dependencies globally on the host." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "wrong_result", + "evidence": "Same-corpus retrieval is still the precondition and remains wrong_result in the live baseline." 
+ }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "OpenViking staged retrieval, hierarchy selection, and recursive/context expansion jobs are encoded as blocked fixtures." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "Trajectory readback is a reference feature but not a scored adapter output." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/volcengine/OpenViking/", + "status": "real" + }, + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "wrong_result" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "OpenViking repository", + "url": "https://github.com/volcengine/OpenViking/", + "evidence": "Official source for OpenViking local context database, resource, and retrieval APIs." + } + ], + "setup_path": "Use the pinned Docker local embedding path from scripts/live-baseline-benchmark.sh, then run OpenViking add_resource/find before any deep profile scoring.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner container; no host model or compiler setup outside Docker.", + "resource_expectation": "Local embedding setup can download CPU wheels and model assets; record build/import logs, model cache size, and elapsed time.", + "retry_guidance": [ + "Run the default pinned llama-cpp-python==0.3.28 CPU wheel path first.", + "Override the OpenViking llama-cpp-python version or index only when the default wheel is unavailable for the Docker platform.", + "Fix evidence-bearing same-corpus output and materialize selected hierarchy/expansion artifacts before converting blocked context_trajectory fixtures into scored jobs." + ], + "research_depth": "D2 reviewed; local embedding setup pinned; blocked fixtures encoded" + }, + "notes": [ + "OpenViking remains a context-trajectory reference, but this gate prevents a smoke wrong_result or blocked fixture from becoming a deep-profile win claim." 
+ ] + }, + { + "adapter_id": "ragflow_research_gate", + "project": "RAGFlow", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "XY-900 promotes the Docker-safe tiny-corpus evidence smoke into a generated real_world_job report while the checked-in row remains smoke-only research_gate evidence.", + "command": "cargo make smoke-ragflow-docker", + "artifact": "tmp/real-world-memory/ragflow-smoke/ragflow-smoke.json" + }, + "run": { + "status": "blocked", + "evidence": "The live path requires explicit resource-envelope opt-in and a local self-hosted RAGFlow API key; setup failures stay typed in the generated smoke artifact.", + "command": "ELF_RAGFLOW_SMOKE_START=1 ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 cargo make smoke-ragflow-docker", + "artifact": "tmp/real-world-memory/ragflow-smoke/memory_projects_manifest.ragflow-smoke.json" + }, + "result": { + "status": "blocked", + "evidence": "The smoke now emits ragflow-report.json and ragflow-report.md from one generated retrieval job. Pass or wrong_result is allowed only when returned reference chunks map to generated evidence ids; resource, setup, and API-key limits remain typed blockers.", + "artifact": "tmp/real-world-memory/ragflow-smoke/ragflow-report.json" + }, + "capabilities": [ + { + "capability": "adapter_candidate_verdict", + "status": "not_encoded", + "evidence": "XY-882 completed D1/D2 feasibility research and marks RAGFlow adapter_candidate; no adapter run is encoded." + }, + { + "capability": "docker_service_setup", + "status": "blocked", + "evidence": "The smoke records official Docker setup, image/disk/startup envelope, CPU/GPU mode, vm.max_map_count handling, provider boundaries, and retry behavior." 
+ }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "One generated retrieval job is scored from the smoke artifact or typed blocked when resource, service, or local API-key boundaries stop execution." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The scored smoke does not claim broad RAGFlow quality, private corpus behavior, scale, or comparative ranking." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "blocked", + "evidence": "The generated retrieval smoke is scored as pass, wrong_result, blocked, or incomplete by ragflow-report.json; the checked-in row remains blocked until live reference chunks map to evidence ids." + }, + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "RAGFlow knowledge output is not mapped to real_world_job page or citation scoring." + }, + { + "suite_id": "production_ops", + "status": "blocked", + "evidence": "Resource envelope and service startup retry guidance must be documented first." + } + ], + "scenarios": [ + { + "scenario_id": "reference_chunk_citation_mapping", + "suite_id": "retrieval", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-929 adds a representative blocked fixture for RAGFlow reference-chunk citation scoring. 
The job must remain blocked until returned reference chunks include generated document ids, chunk ids, content, and document metadata mapped to benchmark evidence ids.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/ragflow_reference_chunks_blocked.json" + }, + { + "scenario_id": "private_or_large_corpus_ragflow_quality", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "Private corpus, large-corpus, and hosted RAGFlow quality are outside the generated-public Docker representative lane and must not be inferred from smoke reports.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/infiniflow/ragflow", + "status": "real" + }, + { + "kind": "source", + "ref": "https://ragflow.io/docs/", + "status": "real" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/ragflow-smoke/ragflow-report.json", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/ragflow-smoke/ragflow-report.md", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "RAGFlow repository", + "url": "https://github.com/infiniflow/ragflow", + "evidence": "Official source for RAGFlow service code and Docker Compose setup." + }, + { + "label": "RAGFlow docs", + "url": "https://ragflow.io/docs/", + "evidence": "Official deployment and setup documentation." + }, + { + "label": "RAGFlow HTTP API reference", + "url": "https://raw.githubusercontent.com/infiniflow/ragflow/main/docs/references/http_api_reference.md", + "evidence": "Official reference for OpenAI-compatible responses with reference chunks and document metadata." 
+ } + ], + "setup_path": "Implement a tiny Docker evidence-smoke runner using the official Docker deployment, dataset ingest API, and OpenAI-compatible query API.", + "runtime_boundary": "Run scripts/ragflow-docker-evidence-smoke.sh through cargo make; the live path uses the official RAGFlow Docker Compose service boundary without host-global RAGFlow installs.", + "resource_expectation": "Large multi-service RAG stack; generated artifacts record CPU/GPU mode, memory, disk, image size, expanded disk notes, startup time, vm.max_map_count handling, and provider boundaries before scoring.", + "retry_guidance": [ + "Run cargo make smoke-ragflow-docker first to produce a typed preflight artifact.", + "Start the live path only with ELF_RAGFLOW_SMOKE_START=1 and ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1.", + "Keep private corpora and operator-owned provider credentials out of this smoke; map only generated public corpus reference chunks to evidence ids." + ], + "research_depth": "D2 feasibility verdict plus XY-885 evidence-smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches query output" + }, + "notes": [ + "Status class: smoke-only scored adapter path with typed resource/setup/API-key blockers.", + "Do not interpret ragflow-report.json as broad RAGFlow quality evidence unless reference chunks map to generated evidence ids." + ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement RAGFlow Docker evidence-smoke adapter", + "reason": "Created as XY-885. XY-882 found a Docker boundary and reference-chunk output contract; implementation must prove a tiny ingest/query run before any quality claim." 
+ } + }, + { + "adapter_id": "lightrag_research_gate", + "project": "LightRAG", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "XY-886 adds a Docker-profile context-export smoke command, and XY-900 keeps its generated retrieval fixtures scored through real_world_job_benchmark. The checked-in row remains smoke-only research_gate evidence.", + "command": "cargo make smoke-lightrag-docker-context", + "artifact": "tmp/real-world-memory/lightrag-context/lightrag-materialization.json" + }, + "run": { + "status": "blocked", + "evidence": "The default smoke records a typed setup/runtime failure if the LightRAG API is unavailable; set ELF_LIGHTRAG_CONTEXT_START=1 to start the opt-in Docker service profile.", + "command": "ELF_LIGHTRAG_CONTEXT_START=1 cargo make smoke-lightrag-docker-context", + "artifact": "tmp/real-world-memory/lightrag-context/summary.json" + }, + "result": { + "status": "blocked", + "evidence": "The smoke emits lightrag-report.json and lightrag-report.md over generated retrieval jobs. Pass or wrong_result is allowed only when returned context, references, or file paths map to generated evidence ids.", + "artifact": "tmp/real-world-memory/lightrag-context/lightrag-report.json" + }, + "capabilities": [ + { + "capability": "docker_service_setup", + "status": "blocked", + "evidence": "The opt-in compose profile records explicit LightRAG image, LLM, embedding, rerank, workspace, and Docker volume configuration without host-global installs." + }, + { + "capability": "retrieved_context_export", + "status": "blocked", + "evidence": "The materializer calls /documents/texts, waits on /documents/track_status, and queries /query with only_need_context plus chunk references when the service is reachable." 
+ }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "The LightRAG materializer rewrites generated retrieval fixtures with adapter_response evidence only when source paths or context map to required evidence ids." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The smoke does not score broad graph-RAG quality, private corpora, scale, or comparative ranking claims." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "blocked", + "evidence": "The generated smoke can exercise retrieval context/source mapping for retrieval fixtures, but the checked-in record stays blocked until a live artifact reaches query output." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "LightRAG update/delete/current-versus-historical behavior is not encoded by the context-export smoke." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "The smoke records context/source mappings, but full trace or viewer diagnostics are not mapped to benchmark scoring." + } + ], + "scenarios": [ + { + "scenario_id": "context_source_reference_mapping", + "suite_id": "retrieval", + "status": "incomplete", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-929 adds a representative incomplete fixture for LightRAG context/source-reference scoring. 
The job cannot score until the opt-in Docker API exports generated source file paths, snippets, or reference content.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/lightrag_context_sources_incomplete.json" + }, + { + "scenario_id": "graph_rag_navigation_quality", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "LightRAG graph-RAG navigation quality remains not_tested beyond the context-source output contract; no ELF win, tie, or loss is claimed.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/HKUDS/LightRAG", + "status": "real" + }, + { + "kind": "source", + "ref": "https://github.com/HKUDS/LightRAG/blob/main/docs/DockerDeployment.md", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make smoke-lightrag-docker-context", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/lightrag-context/lightrag-materialization.json", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/lightrag-context/lightrag-report.md", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "LightRAG repository", + "url": "https://github.com/HKUDS/LightRAG", + "evidence": "Official source for LightRAG server, Docker, and retrieval modes." + }, + { + "label": "LightRAG Docker docs", + "url": "https://github.com/HKUDS/LightRAG/blob/main/docs/DockerDeployment.md", + "evidence": "Official Docker deployment reference." + }, + { + "label": "LightRAG API server docs", + "url": "https://github.com/HKUDS/LightRAG/blob/main/docs/LightRAG-API-Server.md", + "evidence": "Official query-mode and context-output reference." 
+ }, + { + "label": "LightRAG core programming docs", + "url": "https://github.com/HKUDS/LightRAG/blob/main/docs/ProgramingWithCore.md", + "evidence": "Official source-id and file-path citation reference." + } + ], + "setup_path": "Run cargo make smoke-lightrag-docker-context for a typed preflight artifact; set ELF_LIGHTRAG_CONTEXT_START=1 to start the opt-in LightRAG Docker profile and attempt live context export.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner plus opt-in lightrag and lightrag-mock-provider services; generated source files and LightRAG data stay in Docker-mounted artifact paths and Docker volumes.", + "resource_expectation": "The default profile uses the official LightRAG image, a local OpenAI-compatible mock provider, 64-dimensional embeddings, rerank disabled for context queries, cargo/pip/Hugging Face caches, and Docker volumes for rag_storage, inputs, and prompts.", + "retry_guidance": [ + "Run cargo make smoke-lightrag-docker-context first; a missing API must remain a typed incomplete artifact, not a pass claim.", + "Set ELF_LIGHTRAG_CONTEXT_START=1 only when Docker may pull/start the LightRAG service profile.", + "Score retrieval only when returned context, references.file_path, or references.content map to required evidence ids." + ], + "research_depth": "D2 feasibility plus XY-886 context-export implementation and XY-900 scored smoke aggregation; checked-in record remains research_gate unless a generated artifact reaches query output" + }, + "notes": [ + "Status class: smoke-only scored adapter path with typed service/setup blockers.", + "Do not interpret lightrag-report.json as broad graph-RAG quality evidence unless generated source/context mappings score as pass." + ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement LightRAG Docker context-export adapter", + "reason": "Created as XY-886. 
XY-882 found a Docker service path and context/source mapping contract; implementation must prove evidence export before scoring." + } + }, + { + "adapter_id": "graphrag_research_gate", + "project": "GraphRAG", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "XY-900 promotes the Docker-safe generated-corpus GraphRAG smoke into a scored knowledge_compilation report while the checked-in row remains smoke-only research_gate evidence.", + "command": "cargo make smoke-graphrag-docker", + "artifact": "tmp/real-world-memory/graphrag-smoke/graphrag-smoke.json" + }, + "run": { + "status": "blocked", + "evidence": "The default smoke records a typed blocked artifact without model calls; set ELF_GRAPHRAG_SMOKE_RUN=1 with explicit provider configuration to attempt live GraphRAG index/query.", + "command": "ELF_GRAPHRAG_SMOKE_RUN=1 cargo make smoke-graphrag-docker", + "artifact": "tmp/real-world-memory/graphrag-smoke/summary.json" + }, + "result": { + "status": "blocked", + "evidence": "The smoke now emits graphrag-report.json and graphrag-report.md from one generated knowledge_compilation job. Pass or wrong_result is allowed only when GraphRAG output tables map to generated evidence ids.", + "artifact": "tmp/real-world-memory/graphrag-smoke/graphrag-report.json" + }, + "capabilities": [ + { + "capability": "indexing_resource_envelope", + "status": "blocked", + "evidence": "The smoke bounds the generated public corpus, timeout, GraphRAG package, model configuration, cache size, output size, elapsed time, and observed cache entries." + }, + { + "capability": "source_citation_mapping", + "status": "blocked", + "evidence": "The generated artifact maps GraphRAG documents, text_units, communities, community_reports, entities, and relationships parquet rows back to real_world_job evidence ids when available." 
+ }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "The smoke writes a generated real_world_job fixture and scored report; provider/setup limits remain blocked until live GraphRAG output maps to expected evidence ids." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The smoke does not claim broad graph-navigation quality, knowledge-synthesis quality, private corpora, or large-corpus indexing." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "blocked", + "evidence": "The generated smoke can exercise parquet table source coverage for one tiny knowledge-compilation fixture, but the checked-in record stays blocked until live output exists." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "The smoke may run local search for reachability, but retrieval quality scoring is not encoded." + }, + { + "suite_id": "production_ops", + "status": "not_encoded", + "evidence": "Resource bounds are recorded, but no production-ops suite scoring is encoded." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "GraphRAG update/delete/current-versus-historical behavior is not encoded by the smoke." + } + ], + "scenarios": [ + { + "scenario_id": "output_table_citation_mapping", + "suite_id": "knowledge_compilation", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-929 adds a representative blocked fixture for GraphRAG output-table citation scoring. 
The job requires provider-backed Docker output tables whose document, text-unit, community, report, entity, and relationship identifiers map to generated evidence ids.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphrag_output_tables_blocked.json" + }, + { + "scenario_id": "graph_summary_synthesis_quality", + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "GraphRAG graph-summary synthesis quality remains not_tested until provider-backed output tables and local-search context are scored beyond the smoke contract.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/microsoft/graphrag", + "status": "real" + }, + { + "kind": "source", + "ref": "https://microsoft.github.io/graphrag/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make smoke-graphrag-docker", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphrag-smoke/graphrag-smoke.json", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphrag-smoke/graphrag-report.md", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "GraphRAG repository", + "url": "https://github.com/microsoft/graphrag", + "evidence": "Official Microsoft GraphRAG source and setup reference." + }, + { + "label": "GraphRAG docs", + "url": "https://microsoft.github.io/graphrag/", + "evidence": "Official documentation for indexing and querying." + }, + { + "label": "GraphRAG input docs", + "url": "https://microsoft.github.io/graphrag/index/inputs/", + "evidence": "Official input format and document metadata reference." 
+ }, + { + "label": "GraphRAG output tables", + "url": "https://microsoft.github.io/graphrag/index/outputs/", + "evidence": "Official output schema with document, text unit, community, and relationship identifiers." + }, + { + "label": "GraphRAG local search docs", + "url": "https://microsoft.github.io/graphrag/query/local_search/", + "evidence": "Official local-search context and graph traversal reference." + } + ], + "setup_path": "Run cargo make smoke-graphrag-docker for a typed preflight artifact; set ELF_GRAPHRAG_SMOKE_RUN=1 with explicit provider configuration for a live GraphRAG index/query attempt.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner, container-local Python venv, generated public corpus, and report artifacts under tmp/real-world-memory/graphrag-smoke.", + "resource_expectation": "The default profile uses a generated public corpus capped by ELF_GRAPHRAG_MAX_DOCS and ELF_GRAPHRAG_MAX_INPUT_CHARS, pins GraphRAG through ELF_GRAPHRAG_PACKAGE, and records elapsed time, cache size, output size, and observed cache entries.", + "retry_guidance": [ + "Run cargo make smoke-graphrag-docker first; missing provider configuration must remain a typed blocked artifact, not a pass claim.", + "Enable ELF_GRAPHRAG_SMOKE_RUN=1 only for generated public corpus indexing with explicit provider configuration.", + "Fail typed if source document or text_unit identifiers cannot be mapped to expected evidence IDs." + ], + "research_depth": "D2 feasibility plus XY-887 Docker smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches GraphRAG output" + }, + "notes": [ + "Status class: smoke-only scored adapter path with typed provider/setup blockers.", + "Do not interpret graphrag-report.json as broad graph-navigation or knowledge-synthesis quality evidence unless output tables map to generated evidence ids." 
+ ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement GraphRAG cost-bounded Docker adapter", + "reason": "Created as XY-887. XY-882 found a Docker-bounded CLI/API path and output-table evidence handles; implementation must stay tiny and cost-recorded." + } + }, + { + "adapter_id": "graphiti_zep_research_gate", + "project": "Graphiti/Zep", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "XY-900 promotes the Docker-contained Graphiti/Zep temporal smoke into a scored memory_evolution report while the checked-in row remains smoke-only research_gate evidence.", + "command": "cargo make smoke-graphiti-zep-docker-temporal", + "artifact": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-smoke.json" + }, + "run": { + "status": "blocked", + "evidence": "The default smoke records a typed setup/runtime failure if live execution is not explicitly enabled. Set ELF_GRAPHITI_ZEP_SMOKE_START=1 and ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration to start Docker-local FalkorDB and run Graphiti.", + "command": "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal", + "artifact": "tmp/real-world-memory/graphiti-zep-smoke/summary.json" + }, + "result": { + "status": "blocked", + "evidence": "The smoke now emits graphiti-zep-report.json and graphiti-zep-report.md from one generated memory_evolution job. The default blocker is live-run opt-in disabled; when ELF_GRAPHITI_ZEP_SMOKE_START=1 and ELF_GRAPHITI_ZEP_SMOKE_RUN=1 are set without provider credentials, the blocker is provider_api_key_missing. 
No hosted Zep service or unrecorded credentials are used.", + "artifact": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-report.json" + }, + "capabilities": [ + { + "capability": "temporal_graph_memory", + "status": "blocked", + "evidence": "The smoke materializes generated current, historical, and rationale facts with validity windows, but the checked-in record stays blocked until a live artifact maps search output." + }, + { + "capability": "docker_graph_store_setup", + "status": "blocked", + "evidence": "The task uses a Docker Compose graphiti-zep profile for FalkorDB and a container-local Python venv; no host-global graph database or hosted Zep service is used." + }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "The generated temporal-validity fixture is scored or typed blocked; live quality evidence requires Graphiti/Zep search output mapped to current and historical evidence ids." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The smoke does not claim broad graph-memory quality, managed Zep service behavior, private-corpus behavior, or large-corpus performance." + } + ], + "suites": [ + { + "suite_id": "memory_evolution", + "status": "blocked", + "evidence": "Generated current/historical relation facts are encoded, but the checked-in manifest stays blocked until the Docker smoke returns validity-window search output." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "Hybrid graph retrieval reachability is not scored beyond the temporal search smoke." + }, + { + "suite_id": "production_ops", + "status": "not_encoded", + "evidence": "The smoke records setup and provider boundaries but does not encode backup, restore, private corpus, or hosted-service operations." 
+ } + ], + "scenarios": [ + { + "scenario_id": "temporal_validity_window_mapping", + "suite_id": "memory_evolution", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-929 adds a representative blocked fixture for Graphiti/Zep temporal-validity scoring. The job remains blocked until provider-backed Docker output maps current and historical validity-window facts to generated evidence ids.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphiti_temporal_validity_blocked.json" + }, + { + "scenario_id": "hosted_zep_temporal_memory", + "suite_id": "memory_evolution", + "status": "unsupported", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "Hosted Zep service behavior is outside the Docker-local representative lane; no hosted-service result is used as ELF win/loss evidence.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/getzep/graphiti", + "status": "real" + }, + { + "kind": "source", + "ref": "https://www.getzep.com/platform/graphiti/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make smoke-graphiti-zep-docker-temporal", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-smoke.json", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-report.md", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "Graphiti repository", + "url": "https://github.com/getzep/graphiti", + "evidence": "Official open-source temporal context graph engine." 
+ }, + { + "label": "Zep Graphiti overview", + "url": "https://www.getzep.com/platform/graphiti/", + "evidence": "Official product documentation for temporal context graph behavior." + }, + { + "label": "Graphiti quick start", + "url": "https://help.getzep.com/graphiti/getting-started/quick-start", + "evidence": "Official setup, episode ingest, and search output reference." + }, + { + "label": "Graphiti FalkorDB configuration", + "url": "https://help.getzep.com/graphiti/configuration/falkor-db-configuration", + "evidence": "Official Docker-local FalkorDB setup reference." + }, + { + "label": "Graphiti fact triples", + "url": "https://help.getzep.com/graphiti/working-with-data/adding-fact-triples", + "evidence": "Official manual fact-triple ingest contract." + } + ], + "setup_path": "Run cargo make smoke-graphiti-zep-docker-temporal for a typed artifact; set ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration for a live attempt.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner plus graphiti-zep FalkorDB profile, container-local Python venv, generated public temporal facts, and report artifacts under tmp/real-world-memory/graphiti-zep-smoke.", + "resource_expectation": "Requires Docker-local FalkorDB plus LLM/embedding configuration; generated artifacts record service startup, storage size, provider boundaries, fact count, and timeout before scoring.", + "retry_guidance": [ + "Run cargo make smoke-graphiti-zep-docker-temporal first to produce a typed blocked artifact.", + "Start the live path only with ELF_GRAPHITI_ZEP_SMOKE_START=1, ELF_GRAPHITI_ZEP_SMOKE_RUN=1, and explicit provider configuration.", + "Treat missing validity windows or unmapped current/historical facts as wrong_result, not pass." 
+ ], + "research_depth": "D2 feasibility plus XY-888 Docker temporal smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches Graphiti search output" + }, + "notes": [ + "Status class: smoke-only scored adapter path with typed live-run opt-in, provider, and setup blockers.", + "Graphiti/Zep remains the temporal-validity reference; do not claim ELF-over-Graphiti/Zep until provider-backed temporal output maps to scored evidence ids." + ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement Graphiti/Zep temporal graph adapter", + "reason": "Created as XY-888. XY-882 found a Docker-local graph-store path and fact/validity-window output contract for memory_evolution scoring." + } + }, + { + "adapter_id": "letta_research_gate", + "project": "Letta", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "Letta is D1 reviewed as a core/archival memory reference. 
The contained comparison contract now has cargo make smoke-letta-core-archive-export-readback, a Docker-only benchmark-created agent export/readback materializer that must return core block JSON, archival search/readback JSON, and source ids before any scenario claim is scored.", + "command": "cargo make smoke-letta-core-archive-export-readback", + "artifact": "tmp/real-world-memory/letta-core-archive/letta-core-archive-export.json" + }, + "run": { + "status": "blocked", + "evidence": "The default materializer emits a typed blocked report unless a Docker-local Letta server and explicit model/provider configuration produce benchmark-owned core block export and archival readback/search output.", + "command": "ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 cargo make smoke-letta-core-archive-export-readback", + "artifact": "tmp/real-world-memory/letta-core-archive/summary.json" + }, + "result": { + "status": "blocked", + "evidence": "No Letta core block, archival fallback, stale-core, scope, provenance, or project-decision pass/win/tie/loss is claimed until the generated export/readback artifact maps required source ids.", + "artifact": "tmp/real-world-memory/letta-core-archive/report.json" + }, + "capabilities": [ + { + "capability": "core_archival_memory", + "status": "blocked", + "evidence": "ELF fixture jobs score core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search; Letta remains blocked until its export maps equivalent source ids." + }, + { + "capability": "docker_embedding_configuration", + "status": "blocked", + "evidence": "Official Docker setup requires explicit embedding configuration before archival retrieval can be tested." 
+ }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "A Docker-contained materializer now exists and emits typed blocked evidence by default; live scoring still requires exported Letta core blocks, archival list/search JSON, and source-id mappings." + }, + { + "capability": "broad_letta_quality_claim", + "status": "not_encoded", + "evidence": "The materializer does not score broad Letta product quality, hosted/private state, personalization breadth, or production durability." + } + ], + "suites": [ + { + "suite_id": "personalization", + "status": "not_encoded", + "evidence": "Core memory preference application is not encoded for Letta." + }, + { + "suite_id": "project_decisions", + "status": "blocked", + "evidence": "The project-decision recovery row is represented only through the core_archival_memory export/readback materializer and remains blocked without mapped source ids." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Agent resumption through Letta memory blocks is not encoded." + }, + { + "suite_id": "core_archival_memory", + "status": "blocked", + "evidence": "A Docker-contained materializer now emits the core_archival_memory scenarios as typed blocked unless live Letta export/readback maps core block JSON, archival search/readback JSON, and source ids." + } + ], + "scenarios": [ + { + "scenario_id": "core_block_attachment_readback", + "suite_id": "core_archival_memory", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-core-block-attachment-001 scores exact core block attachment and keeps core readback out of Qdrant-backed archival search. 
Letta remains blocked until the generated export/readback artifact maps this core block attachment source id.", + "command": "cargo make smoke-letta-core-archive-export-readback", + "artifact": "tmp/real-world-memory/letta-core-archive/summary.json" + }, + { + "scenario_id": "core_block_scope_readback", + "suite_id": "core_archival_memory", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-core-block-scope-001 scores read_profile, shared scope, and private-owner boundaries. Letta scope behavior remains blocked until the generated export includes agent, block, visibility metadata, and source ids.", + "command": "cargo make smoke-letta-core-archive-export-readback", + "artifact": "tmp/real-world-memory/letta-core-archive/summary.json" + }, + { + "scenario_id": "core_block_provenance_readback", + "suite_id": "core_archival_memory", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-core-block-provenance-001 scores source_ref and audit_history readback. Letta provenance remains blocked until exported core memory includes stable source ids and audit-equivalent events.", + "command": "cargo make smoke-letta-core-archive-export-readback", + "artifact": "tmp/real-world-memory/letta-core-archive/summary.json" + }, + { + "scenario_id": "stale_core_detection", + "suite_id": "core_archival_memory", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-stale-core-detection-001 scores archival evidence superseding a stale core block. 
Letta stale-core comparison is blocked until core export and archival readback can be joined by source ids.", + "command": "cargo make smoke-letta-core-archive-export-readback", + "artifact": "tmp/real-world-memory/letta-core-archive/summary.json" + }, + { + "scenario_id": "archival_fallback_readback", + "suite_id": "core_archival_memory", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-archival-fallback-001 scores fallback from insufficient core memory to archival note search. Letta fallback comparison is blocked until archival search output can be exported with source ids.", + "command": "cargo make smoke-letta-core-archive-export-readback", + "artifact": "tmp/real-world-memory/letta-core-archive/summary.json" + }, + { + "scenario_id": "core_archival_project_decision_recovery", + "suite_id": "core_archival_memory", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-project-decision-recovery-001 scores core routing plus archival decision rationale. 
Letta project-decision recovery remains blocked until the generated export/readback artifact maps core routing plus archival rationale source ids.", + "command": "cargo make smoke-letta-core-archive-export-readback", + "artifact": "tmp/real-world-memory/letta-core-archive/summary.json" + } + ], + "evidence": [ + { + "kind": "artifact", + "ref": "tmp/real-world-memory/letta-core-archive/letta-core-archive-export.json", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/letta-core-archive/summary.json", + "status": "blocked" + }, + { + "kind": "source", + "ref": "https://docs.letta.com/guides/docker", + "status": "real" + }, + { + "kind": "source", + "ref": "https://docs.letta.com/api/python", + "status": "real" + }, + { + "kind": "source", + "ref": "https://docs.letta.com/api/resources/agents/subresources/passages/methods/search", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "Letta Docker docs", + "url": "https://docs.letta.com/guides/docker", + "evidence": "Official Docker setup and explicit embedding configuration boundary." + }, + { + "label": "Letta Python API", + "url": "https://docs.letta.com/api/python", + "evidence": "Official Python SDK memory block creation and retrieval examples." + }, + { + "label": "Letta archival search API", + "url": "https://docs.letta.com/api/resources/agents/subresources/passages/methods/search", + "evidence": "Official archival-memory search endpoint contract." + } + ], + "setup_path": "Run cargo make smoke-letta-core-archive-export-readback for a typed artifact; set ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 with explicit model/provider configuration for a live export attempt. 
The smoke exports core block JSON plus archival search/readback JSON when Letta setup succeeds.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner plus optional Letta server profile, benchmark-created agent, benchmark-owned fixture corpus, no hosted/private state, and artifacts under tmp/real-world-memory/letta-core-archive.", + "resource_expectation": "Letta Docker server, Python SDK client, explicit model and embedding configuration, exported core memory, archival search output, and provider boundaries must be explicit in the artifact.", + "retry_guidance": [ + "Default command records a typed blocked artifact without model calls.", + "Enable the live path only with Docker-local Letta and explicit provider or local model configuration.", + "Score core-versus-archival scenarios only after core block export and archival list/search output map to fixture evidence ids." + ], + "research_depth": "D1 feasibility verdict: research_only (XY-882); XY-927 selected the contained export/readback contract; XY-984 adds the Docker-contained materializer and keeps the comparison blocked until live export evidence maps source ids." + }, + "notes": [] + }, + { + "adapter_id": "langgraph_research_gate", + "project": "LangGraph", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "LangGraph is D1 reviewed as a replay/checkpoint reference, not a direct memory backend adapter." + }, + "run": { + "status": "not_encoded", + "evidence": "No checkpoint replay real_world_job harness is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No production-ops or resume suite result is claimed." + }, + "capabilities": [ + { + "capability": "checkpoint_replay_regression", + "status": "not_encoded", + "evidence": "Replay/fork behavior needs an agent graph harness before scoring." 
+ }, + { + "capability": "standalone_memory_backend", + "status": "unsupported", + "evidence": "LangGraph persistence is an agent-state/checkpoint layer, not a drop-in memory retrieval backend." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No LangGraph benchmark materializer exists." + } + ], + "suites": [ + { + "suite_id": "production_ops", + "status": "not_encoded", + "evidence": "Checkpoint recovery and replay regression are not encoded." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Resume from checkpoint with memory reads is not encoded." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "source", + "ref": "https://docs.langchain.com/oss/python/langgraph/persistence", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "LangGraph persistence docs", + "url": "https://docs.langchain.com/oss/python/langgraph/persistence", + "evidence": "Official documentation for checkpoints, replay, fork, and persistence behavior." + } + ], + "setup_path": "Build a tiny LangGraph agent with a checkpointer and explicit memory read/write steps before scoring.", + "runtime_boundary": "Docker-only Python harness with checkpoint store under the artifact directory.", + "resource_expectation": "Small runtime expected, but LLM calls and side effects must be stubbed or deterministic before replay claims.", + "retry_guidance": [ + "Encode one replay/fork failure recovery job.", + "Keep LangGraph classified as replay reference unless memory retrieval is actually exercised." 
+ ], + "research_depth": "D1 feasibility verdict: research_only (XY-882); replay/checkpoint reference, adapter not encoded" + }, + "notes": [] + }, + { + "adapter_id": "nanograph_research_gate", + "project": "nanograph", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "nanograph is D1 reviewed as typed graph DX, but no Docker adapter is implemented." + }, + "run": { + "status": "not_encoded", + "evidence": "No typed graph schema/query real_world_job run is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No graph temporal or retrieval-debug result is claimed." + }, + "capabilities": [ + { + "capability": "typed_graph_schema", + "status": "not_encoded", + "evidence": "Schema-as-code and typed query ergonomics need a benchmark harness." + }, + { + "capability": "memory_backend_comparison", + "status": "unsupported", + "evidence": "nanograph is a graph database reference, not a complete agent memory service." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No nanograph materializer exists." + } + ], + "suites": [ + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "Typed current/historical fact jobs are not encoded." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "Typed query explainability is not scored." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/nanograph/nanograph", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "nanograph repository", + "url": "https://github.com/nanograph/nanograph", + "evidence": "Official source for on-device typed property graph behavior." 
+ } + ], + "setup_path": "Build or install nanograph inside Docker and load a typed graph fixture from generated corpus facts.", + "runtime_boundary": "Docker-only CLI run with graph folder under benchmark artifacts.", + "resource_expectation": "Light local graph runtime expected; record binary build/install time and graph artifact size.", + "retry_guidance": [ + "Define a minimal schema for memory_evolution facts.", + "Score typed query output only if it cites fixture evidence IDs." + ], + "research_depth": "D1 feasibility verdict: research_only (XY-882); typed graph DX reference, adapter not encoded" + }, + "notes": [] + }, + { + "adapter_id": "llm_wiki_research_gate", + "project": "llm-wiki", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "llm-wiki is D1 reviewed as a knowledge-compilation reference, but no plugin or generated-page adapter is implemented." + }, + "run": { + "status": "not_encoded", + "evidence": "No llm-wiki corpus-to-page run is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No knowledge page citation or lint result is claimed." + }, + "capabilities": [ + { + "capability": "knowledge_page_compilation", + "status": "not_encoded", + "evidence": "Wiki generation and citation lint are not executed by the runner." + }, + { + "capability": "live_service_runtime", + "status": "unsupported", + "evidence": "llm-wiki is a plugin/workflow reference rather than a service adapter." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No page materializer or scorer mapping exists." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "Corpus-to-wiki output is not encoded." 
+ }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Resume answers from wiki pages are not encoded." + } + ], + "scenarios": [ + { + "scenario_id": "wiki_page_citation_lint", + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "llm-wiki remains a knowledge-workflow reference. No Docker-contained plugin or file-based page materializer emits cited wiki sections for scoring.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/nvk/llm-wiki", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "llm-wiki repository", + "url": "https://github.com/nvk/llm-wiki", + "evidence": "Official source for the LLM Wiki plugin and knowledge-base workflow." + } + ], + "setup_path": "Research plugin bootstrap inside a Docker-contained Codex or file-based harness, then materialize page artifacts.", + "runtime_boundary": "Docker-only plugin or fixture materializer; no user-global Codex plugin install.", + "resource_expectation": "LLM generation cost depends on page build; record provider boundary and generated artifact size.", + "retry_guidance": [ + "Prototype a fixture-only page build with explicit citations.", + "Do not score until generated sections can be mapped to evidence IDs." 
+ ], + "research_depth": "D1 feasibility verdict: research_only (XY-882); derived wiki workflow reference, adapter not encoded" + }, + "notes": [] + }, + { + "adapter_id": "gbrain_research_gate", + "project": "gbrain", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "gbrain is D1 reviewed as a compiled-truth and timeline reference, but no Docker adapter is implemented." + }, + "run": { + "status": "not_encoded", + "evidence": "No gbrain brain-repo import or compiled-truth run is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No knowledge-synthesis or operator-continuity result is claimed." + }, + "capabilities": [ + { + "capability": "compiled_truth_timeline", + "status": "not_encoded", + "evidence": "Compiled truth plus timeline output is a reference pattern but not scored." + }, + { + "capability": "postgres_backed_brain_repo", + "status": "blocked", + "evidence": "A Docker-local brain repo and Postgres setup path must be proven before execution." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No gbrain materializer exists." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "Compiled truth and timeline pages are not scored." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "Operator continuity through brain pages is not encoded." 
+ } + ], + "scenarios": [ + { + "scenario_id": "compiled_truth_timeline_export", + "suite_id": "knowledge_compilation", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "gbrain compiled-truth and timeline scoring remains blocked until a Docker-local brain repository and database setup emits current-truth pages with source timeline evidence.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/garrytan/gbrain", + "status": "real" + }, + { + "kind": "source", + "ref": "https://github.com/garrytan/gbrain/blob/master/docs/guides/compiled-truth.md", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "gbrain repository", + "url": "https://github.com/garrytan/gbrain", + "evidence": "Official source for brain repo and retrieval workflow." + }, + { + "label": "compiled truth guide", + "url": "https://github.com/garrytan/gbrain/blob/master/docs/guides/compiled-truth.md", + "evidence": "Official guide for compiled truth plus timeline behavior." + } + ], + "setup_path": "Create a Docker-local brain repo fixture, run import/sync, and export compiled truth plus timeline evidence.", + "runtime_boundary": "Docker-only repository and database state with no operator-owned brain repo.", + "resource_expectation": "Postgres-backed sync and embedding choices must be explicit; record DB size and import time.", + "retry_guidance": [ + "Prototype a tiny brain repo with one current-truth page and timeline.", + "Score only if compiled truth cites the source timeline evidence." 
+ ], + "research_depth": "D1 feasibility verdict: blocked (XY-882); Docker-local brain repo and database path not proven" + }, + "notes": [] + }, + { + "adapter_id": "graphify_docker_smoke", + "project": "graphify", + "adapter_kind": "docker_cli_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "XY-900 validation reached the Docker-only graph/report smoke setup inside the baseline runner without host-global assistant hooks.", + "command": "cargo make smoke-graphify-docker-graph-report", + "artifact": "tmp/real-world-memory/graphify-smoke/graphify-smoke.json" + }, + "run": { + "status": "pass", + "evidence": "The smoke installed graphify in a container-local venv, ran over a generated public corpus, and produced graph/report/query output for scoring.", + "command": "cargo make smoke-graphify-docker-graph-report", + "artifact": "tmp/real-world-memory/graphify-smoke/summary.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The smoke emits graphify-report.json and graphify-report.md from one generated knowledge_compilation job. The current scored report maps evidence ids but remains wrong_result because the scoring rubric still records a wrong-result signal.", + "artifact": "tmp/real-world-memory/graphify-smoke/graphify-report.json" + }, + "capabilities": [ + { + "capability": "docker_cli_boundary", + "status": "pass", + "evidence": "The smoke uses docker-compose.baseline.yml baseline-runner, a container-local Python venv, and isolated assistant config paths; it does not install host-global assistant hooks." + }, + { + "capability": "graph_report_generation", + "status": "pass", + "evidence": "The smoke captures graphify-out/graph.json, GRAPH_REPORT.md, cache metadata, command logs, build time, graph size, and report size." 
+ }, + { + "capability": "real_world_job_adapter", + "status": "wrong_result", + "evidence": "The smoke writes a generated real_world_job fixture and scored report; current knowledge_compilation scoring is wrong_result, not pass." + }, + { + "capability": "multimodal_code_graph", + "status": "not_encoded", + "evidence": "Multimodal extraction for videos, images, PDFs, or broad codebase understanding is a reference capability but not scored by this smoke." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The smoke does not claim broad graph quality, private corpus behavior, scale, or authoritative memory-store behavior." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "wrong_result", + "evidence": "The generated smoke exercised graph/report evidence mapping for one generated knowledge-compilation fixture and scored wrong_result with mean_score 0.75." + }, + { + "suite_id": "retrieval", + "status": "blocked", + "evidence": "Graph-guided query output is present only as support for the generated knowledge_compilation smoke; broad retrieval quality scoring remains unclaimed." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Resume answers from graph context are not encoded." + } + ], + "scenarios": [ + { + "scenario_id": "graph_report_navigation_lint", + "suite_id": "knowledge_compilation", + "status": "wrong_result", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-929 adds a representative graphify fixture that scores graph report navigation, source-location citations, stale-source lint, and unsupported-summary handling as wrong_result because stale-source lint is still missing. 
This remains graphify non-pass evidence, not an ELF victory claim.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphify_graph_report_wrong_result.json" + }, + { + "scenario_id": "broad_graph_navigation_quality", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "Broad graph-navigation, codebase, multimodal, and private-corpus quality remain not_tested; the graphify evidence is bounded to generated graph/report artifacts.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/safishamsi/graphify", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make smoke-graphify-docker-graph-report", + "status": "wrong_result" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphify-smoke/graphify-smoke.json", + "status": "pass" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphify-smoke/graphify-report.md", + "status": "wrong_result" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "graphify repository", + "url": "https://github.com/safishamsi/graphify", + "evidence": "Official source for graphify graph extraction and query workflow." + }, + { + "label": "graphify README", + "url": "https://github.com/safishamsi/graphify/blob/v3/README.md", + "evidence": "Official CLI, output artifact, query, and source-location contract." 
+ } + ], + "setup_path": "Run cargo make smoke-graphify-docker-graph-report to install graphify in Docker, build graph/report artifacts from a generated public corpus, and export query evidence without installing host-global assistant hooks.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner, container-local Python venv, isolated HOME/config paths, generated public corpus, and artifacts under tmp/real-world-memory/graphify-smoke.", + "resource_expectation": "Graph build cost scales with corpus and model choices; generated artifacts record package reference, provider/model boundary, build time, graph size, report size, cache size, timeout, and retry behavior.", + "retry_guidance": [ + "Run cargo make smoke-graphify-docker-graph-report first; setup/runtime failures must remain typed artifacts, not pass claims.", + "Do not use graphify host assistant hook installs or operator-owned assistant configuration as proof.", + "Score graph-guided answers only when graph.json, GRAPH_REPORT.md, and graphify query output map to generated evidence ids." + ], + "research_depth": "D1 feasibility verdict plus XY-889 Docker graph/report smoke implementation and XY-900 scored smoke promotion; current Docker validation reaches graphify output and scores the tiny knowledge_compilation job as wrong_result" + }, + "notes": [ + "Status class: live Docker scored smoke with a current wrong_result outcome.", + "Do not interpret graphify-report.json as broad graph-navigation or knowledge-compilation quality evidence; the tiny smoke is scored and currently non-pass." + ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement graphify Docker graph-report adapter", + "reason": "Created as XY-889. XY-882 found a Docker-only CLI/materializer path and source-file/source-location output contract." 
+ } + } + ] + }, + "capture_integration": { + "real": [], + "fixture_backed": [], + "mocked": [], + "blocked": [], + "not_encoded": [ + "No capture/integration behavior was declared by encoded fixtures." + ], + "notes": [] + }, + "summary": { + "job_count": 11, + "encoded_suite_count": 3, + "pass": 9, + "wrong_result": 0, + "lifecycle_fail": 0, + "incomplete": 0, + "blocked": 2, + "not_encoded": 0, + "unsupported_claim": 0, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_total": 22, + "expected_evidence_matched": 22, + "expected_evidence_recall": 1.0, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 11, + "wrong_result_stage_attribution_count": 0, + "mean_score": 0.818, + "mean_latency_ms": 9.836, + "total_cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "evidence_required_count": 22, + "evidence_covered_count": 22, + "evidence_coverage": 1.0, + "source_ref_required_count": 22, + "source_ref_covered_count": 22, + "source_ref_coverage": 1.0, + "quote_required_count": 22, + "quote_covered_count": 22, + "quote_coverage": 1.0, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_correctness": 0.0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case_count": 0, + "qdrant_rebuild_pass_count": 0, + "operator_debug_job_count": 0, + "raw_sql_needed_count": 0, + "trace_incomplete_count": 0, + "operator_ux_gap_count": 0, + "consolidation": { + "proposal_count": 0, + "proposal_usefulness": null, + "lineage_completeness": null, + "review_action_correctness": null, + "source_mutation_count": 0, + "proposal_unsupported_claim_count": 0, + "executable_gap_count": 0 + }, + "memory_summary": { + "job_count": 
1, + "summary_count": 1, + "entry_count": 7, + "required_category_count": 6, + "covered_required_category_count": 6, + "missing_required_category_count": 0, + "top_of_mind_count": 1, + "background_count": 1, + "stale_count": 1, + "superseded_count": 1, + "tombstone_count": 1, + "derived_project_profile_count": 2, + "source_ref_required_count": 6, + "source_ref_entry_count": 6, + "source_ref_coverage": 1.0, + "freshness_marker_count": 7, + "freshness_coverage": 1.0, + "rationale_count": 7, + "rationale_coverage": 1.0, + "invalid_top_of_mind_count": 0, + "untraced_entry_count": 0, + "derived_with_source_or_unsupported_count": 2, + "derived_missing_source_or_unsupported_count": 0, + "unsupported_derived_entry_count": 1, + "unsupported_current_entry_count": 0, + "tombstone_ref_count": 1, + "source_trace_selected_count": 2, + "source_trace_dropped_count": 1, + "source_trace_stale_count": 1, + "source_trace_superseded_count": 1, + "source_trace_tombstone_count": 1 + }, + "proactive_brief": { + "job_count": 4, + "brief_count": 4, + "suggestion_count": 5, + "required_suggestion_kind_count": 4, + "covered_required_suggestion_kind_count": 4, + "missing_required_suggestion_kind_count": 0, + "evidence_ref_required_count": 5, + "evidence_ref_suggestion_count": 5, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 5, + "freshness_coverage": 1.0, + "action_rationale_count": 5, + "action_rationale_coverage": 1.0, + "recommended_count": 2, + "deferred_count": 2, + "rejected_count": 1, + "current_suggestion_count": 2, + "non_current_suggestion_count": 3, + "stale_warning_count": 3, + "invalid_current_suggestion_count": 0, + "untraced_suggestion_count": 0, + "unsupported_current_suggestion_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 7, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 2, + "source_trace_superseded_count": 2, + "source_trace_tombstone_count": 1 + }, + "scheduled_memory": { + "job_count": 4, + "task_run_count": 
4, + "output_count": 5, + "required_task_kind_count": 4, + "covered_required_task_kind_count": 4, + "missing_required_task_kind_count": 0, + "evidence_ref_required_count": 5, + "evidence_ref_output_count": 5, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 5, + "freshness_coverage": 1.0, + "action_rationale_count": 5, + "action_rationale_coverage": 1.0, + "trace_required_count": 4, + "trace_complete_count": 4, + "trace_coverage": 1.0, + "source_mutation_count": 0, + "current_output_count": 2, + "non_current_output_count": 3, + "invalid_current_output_count": 0, + "untraced_output_count": 0, + "unsupported_current_output_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 7, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 2, + "source_trace_superseded_count": 3, + "source_trace_tombstone_count": 1 + } + }, + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." 
+ }, + { + "suite_id": "project_decisions", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." 
+ }, + { + "suite_id": "consolidation", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "memory_summary", + "status": "pass", + "encoded_job_count": 1, + "score_mean": 1.0, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": 1.0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 1, + "reason": "All 1 encoded job(s) passed." + }, + { + "suite_id": "proactive_brief", + "status": "blocked", + "encoded_job_count": 5, + "score_mean": 0.8, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": 1.0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 5, + "reason": "At least one encoded job is blocked." 
+ }, + { + "suite_id": "scheduled_memory", + "status": "blocked", + "encoded_job_count": 5, + "score_mean": 0.8, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": 1.0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 5, + "reason": "At least one encoded job is blocked." + }, + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." 
+ }, + { + "suite_id": "capture_integration", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "production_ops", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "personalization", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." 
+ }, + { + "suite_id": "core_archival_memory", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "context_trajectory", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." 
+ } + ], + "jobs": [ + { + "suite_id": "memory_summary", + "job_id": "memory-summary-source-trace-001", + "title": "Read back a reviewable current memory summary with source trace", + "status": "pass", + "answer_type": "reviewable_memory_summary", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "summary-contract-current", + "claim_id": "summary_contract_reviewable", + "requirement": "cite" + }, + { + "evidence_id": "xy952-summary-contract", + "claim_id": "summary_stage_now_fixture_backed", + "requirement": "cite" + }, + { + "evidence_id": "summary-ttl-tombstone", + "claim_id": "summary_preserves_tombstone", + "requirement": "cite" + }, + { + "evidence_id": "summary-contract-non-parity-boundary", + "claim_id": "summary_excludes_unsupported_parity", + "requirement": "cite" + } + ], + "produced_answer": "Memory summaries now use a reviewable source-trace contract. Postgres remains authoritative while Qdrant remains a rebuildable derived index. The old memory-summary stage state was not_tested before XY-952. The pre-XY-905 live memory_evolution loss is historical. The fixture-only managed-memory parity claim is tombstoned and excluded. Project profile: ELF summaries are reviewable derived readback, not authoritative notes. 
Excluded candidate: the local summary contract proves parity with managed memory products.", + "produced_evidence": [ + "summary-contract-current", + "summary-contract-non-parity-boundary", + "summary-ttl-tombstone", + "xy952-summary-contract" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 4, + "expected_evidence_matched": 4, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 4, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 51.676775, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": { + "trace_id": "2e80669d-2bcf-4238-b780-9b42aa72d2a2", + "stages": [ + { + "stage_name": "dreaming_readback.service_list", + "kept_evidence": [ + "stale-summary-gap", + "summary-background-sot", + "summary-contract-current", + "summary-contract-non-parity-boundary", + "summary-temporary-claim", + "summary-ttl-tombstone", + "superseded-live-evolution-loss", + "xy952-summary-contract" + ], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "Read 8 source refs from ElfService::list for memory_summary." + }, + { + "stage_name": "dreaming_readback.source_mutation_guard", + "kept_evidence": [ + "stale-summary-gap", + "summary-background-sot", + "summary-contract-current", + "summary-contract-non-parity-boundary", + "summary-temporary-claim", + "summary-ttl-tombstone", + "superseded-live-evolution-loss", + "xy952-summary-contract" + ], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "Generated readback artifacts without mutating source notes." 
+ } + ] + }, + "memory_summary": { + "summary_count": 1, + "entry_count": 7, + "required_category_count": 6, + "covered_required_category_count": 6, + "missing_required_category_count": 0, + "top_of_mind_count": 1, + "background_count": 1, + "stale_count": 1, + "superseded_count": 1, + "tombstone_count": 1, + "derived_project_profile_count": 2, + "source_ref_required_count": 6, + "source_ref_entry_count": 6, + "source_ref_coverage": 1.0, + "freshness_marker_count": 7, + "freshness_coverage": 1.0, + "rationale_count": 7, + "rationale_coverage": 1.0, + "invalid_top_of_mind_count": 0, + "untraced_entry_count": 0, + "derived_with_source_or_unsupported_count": 2, + "derived_missing_source_or_unsupported_count": 0, + "unsupported_derived_entry_count": 1, + "unsupported_current_entry_count": 0, + "tombstone_ref_count": 1, + "source_trace_selected_count": 2, + "source_trace_dropped_count": 1, + "source_trace_stale_count": 1, + "source_trace_superseded_count": 1, + "source_trace_tombstone_count": 1 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "uncertainty_handling", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 4, + "evidence_covered_count": 4, + "source_ref_required_count": 4, + "source_ref_covered_count": 4, + "quote_required_count": 4, + "quote_covered_count": 4, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": 
"proactive_brief", + "job_id": "proactive-daily-project-brief-001", + "title": "Generate a daily project brief from current project memory", + "status": "pass", + "answer_type": "proactive_project_brief", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "daily-current-validation-gate", + "claim_id": "daily_validation_gate", + "requirement": "cite" + }, + { + "evidence_id": "daily-current-ledger-update", + "claim_id": "daily_ledger_update", + "requirement": "cite" + } + ], + "produced_answer": "Run the proactive brief benchmark gate Run the proactive brief fixture command before claiming the lane is validation-ready, then update the XY-951 ledger.", + "produced_evidence": [ + "daily-current-ledger-update", + "daily-current-validation-gate" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 6.884306, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": { + "trace_id": "fc854889-2ac4-436b-a885-b43053922cb9", + "stages": [ + { + "stage_name": "dreaming_readback.service_list", + "kept_evidence": [ + "daily-current-ledger-update", + "daily-current-validation-gate", + "daily-old-parity-trap" + ], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "Read 3 source refs from ElfService::list for proactive_brief." 
+ }, + { + "stage_name": "dreaming_readback.source_mutation_guard", + "kept_evidence": [ + "daily-current-ledger-update", + "daily-current-validation-gate", + "daily-old-parity-trap" + ], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "Generated readback artifacts without mutating source notes." + } + ] + }, + "proactive_brief": { + "brief_count": 1, + "suggestion_count": 1, + "required_suggestion_kind_count": 1, + "covered_required_suggestion_kind_count": 1, + "missing_required_suggestion_kind_count": 0, + "evidence_ref_required_count": 1, + "evidence_ref_suggestion_count": 1, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 1, + "freshness_coverage": 1.0, + "action_rationale_count": 1, + "action_rationale_coverage": 1.0, + "recommended_count": 1, + "deferred_count": 0, + "rejected_count": 0, + "current_suggestion_count": 1, + "non_current_suggestion_count": 0, + "stale_warning_count": 0, + "invalid_current_suggestion_count": 0, + "untraced_suggestion_count": 0, + "unsupported_current_suggestion_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 2, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 1, + "source_trace_superseded_count": 0, + "source_trace_tombstone_count": 0 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + 
"source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "proactive_brief", + "job_id": "proactive-private-corpus-refresh-blocked-001", + "title": "Block private-corpus refresh suggestions when no operator manifest exists", + "status": "blocked", + "answer_type": "proactive_project_brief", + "requires_caveat": true, + "requires_refusal": true, + "can_answer_unknown": true, + "normalized_score": 0.0, + "hard_fail_hits": [], + "expected_evidence": [], + "produced_answer": "", + "produced_evidence": [], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 0, + "expected_evidence_matched": 0, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 0, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 0.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": { + "failure_stage": "live_adapter.suite_support", + "failure_reason": "No operator-owned private production corpus manifest is available; private-corpus refresh suggestions stay blocked under XY-930.", + "stages": [ + { + "stage_name": "live_adapter.suite_support", + "kept_evidence": [], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "No operator-owned private production corpus manifest is available; private-corpus refresh suggestions stay blocked under XY-930." 
+ } + ] + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 0.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "evidence_grounding", + "score": 0.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 0.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "uncertainty_handling", + "score": 0.0, + "max_points": 1.0, + "weight": 0.25 + } + ], + "reason": "No operator-owned private production corpus manifest is available; private-corpus refresh suggestions stay blocked under XY-930.", + "evidence_required_count": 0, + "evidence_covered_count": 0, + "source_ref_required_count": 0, + "source_ref_covered_count": 0, + "quote_required_count": 0, + "quote_covered_count": 0, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "proactive_brief", + "job_id": "proactive-resume-work-brief-001", + "title": "Generate a resume-work brief from current handoff memory", + "status": "pass", + "answer_type": "proactive_project_brief", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "resume-current-handoff", + "claim_id": "resume_current_handoff", + "requirement": "cite" + }, + { + "evidence_id": "resume-current-validation", + "claim_id": "resume_validation", + "requirement": "cite" + } + ], + "produced_answer": "Continue proactive brief scoring Continue the XY-953 fixture and runner scoring work on y/elf-xy-953, then run the proactive brief benchmark command.", + "produced_evidence": [ + "resume-current-handoff", + "resume-current-validation" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + 
"update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 7.336724, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": { + "trace_id": "c77d3ddb-d0c0-4168-a528-a585adfc8a7f", + "stages": [ + { + "stage_name": "dreaming_readback.service_list", + "kept_evidence": [ + "resume-current-handoff", + "resume-current-validation", + "resume-stale-validation" + ], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "Read 3 source refs from ElfService::list for proactive_brief." + }, + { + "stage_name": "dreaming_readback.source_mutation_guard", + "kept_evidence": [ + "resume-current-handoff", + "resume-current-validation", + "resume-stale-validation" + ], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "Generated readback artifacts without mutating source notes." 
+ } + ] + }, + "proactive_brief": { + "brief_count": 1, + "suggestion_count": 1, + "required_suggestion_kind_count": 1, + "covered_required_suggestion_kind_count": 1, + "missing_required_suggestion_kind_count": 0, + "evidence_ref_required_count": 1, + "evidence_ref_suggestion_count": 1, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 1, + "freshness_coverage": 1.0, + "action_rationale_count": 1, + "action_rationale_coverage": 1.0, + "recommended_count": 1, + "deferred_count": 0, + "rejected_count": 0, + "current_suggestion_count": 1, + "non_current_suggestion_count": 0, + "stale_warning_count": 0, + "invalid_current_suggestion_count": 0, + "untraced_suggestion_count": 0, + "unsupported_current_suggestion_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 2, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 1, + "source_trace_superseded_count": 0, + "source_trace_tombstone_count": 0 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "proactive_brief", + "job_id": 
"proactive-stale-decision-audit-001", + "title": "Warn about a stale project decision before suggesting work", + "status": "pass", + "answer_type": "proactive_project_brief", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "stale-decision-old-gate", + "claim_id": "stale_decision_replaced", + "requirement": "cite" + }, + { + "evidence_id": "stale-decision-new-gate", + "claim_id": "stale_decision_replaced", + "requirement": "cite" + } + ], + "produced_answer": "Defer the old operator-ux-only readiness gate Do not use the old operator-ux-only decision as current readiness evidence; it is superseded by the direct proactive brief suite.", + "produced_evidence": [ + "stale-decision-new-gate", + "stale-decision-old-gate" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 1 + }, + "latency_ms": 9.269811, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": { + "trace_id": "d7decd9a-d635-41b5-9dcc-c6e3c5c44fb7", + "stages": [ + { + "stage_name": "dreaming_readback.service_list", + "kept_evidence": [ + "stale-decision-new-gate", + "stale-decision-old-gate" + ], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "Read 2 source refs from ElfService::list for proactive_brief." 
+ }, + { + "stage_name": "dreaming_readback.source_mutation_guard", + "kept_evidence": [ + "stale-decision-new-gate", + "stale-decision-old-gate" + ], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "Generated readback artifacts without mutating source notes." + } + ] + }, + "proactive_brief": { + "brief_count": 1, + "suggestion_count": 1, + "required_suggestion_kind_count": 1, + "covered_required_suggestion_kind_count": 1, + "missing_required_suggestion_kind_count": 0, + "evidence_ref_required_count": 1, + "evidence_ref_suggestion_count": 1, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 1, + "freshness_coverage": 1.0, + "action_rationale_count": 1, + "action_rationale_coverage": 1.0, + "recommended_count": 0, + "deferred_count": 1, + "rejected_count": 0, + "current_suggestion_count": 0, + "non_current_suggestion_count": 1, + "stale_warning_count": 1, + "invalid_current_suggestion_count": 0, + "untraced_suggestion_count": 0, + "unsupported_current_suggestion_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 1, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 0, + "source_trace_superseded_count": 1, + "source_trace_tombstone_count": 0 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + 
"quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "proactive_brief", + "job_id": "proactive-stale-plan-preference-warning-001", + "title": "Reject stale plan and preference suggestions after TTL invalidation", + "status": "pass", + "answer_type": "proactive_project_brief", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "stale-plan-ttl", + "claim_id": "stale_plan_rejected", + "requirement": "cite" + }, + { + "evidence_id": "current-preference-concise-brief", + "claim_id": "current_preference_concise", + "requirement": "cite" + } + ], + "produced_answer": "Reject the expired publish-first plan Do not publish the proactive report before running the new proactive brief benchmark; the old plan expired under TTL. 
Defer long product-comparison prose Use concise evidence-linked proactive briefs and avoid broad hosted-product parity claims.", + "produced_evidence": [ + "current-plan-run-gate", + "current-preference-concise-brief", + "old-preference-long-brief", + "stale-plan-old", + "stale-plan-ttl" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 5, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 1 + }, + "latency_ms": 7.991892, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": { + "trace_id": "f2e795b5-7ac4-4f7d-ab49-75392f6ba8a8", + "stages": [ + { + "stage_name": "dreaming_readback.service_list", + "kept_evidence": [ + "current-plan-run-gate", + "current-preference-concise-brief", + "old-preference-long-brief", + "stale-plan-old", + "stale-plan-ttl" + ], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "Read 5 source refs from ElfService::list for proactive_brief." + }, + { + "stage_name": "dreaming_readback.source_mutation_guard", + "kept_evidence": [ + "current-plan-run-gate", + "current-preference-concise-brief", + "old-preference-long-brief", + "stale-plan-old", + "stale-plan-ttl" + ], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "Generated readback artifacts without mutating source notes." 
+ } + ] + }, + "proactive_brief": { + "brief_count": 1, + "suggestion_count": 2, + "required_suggestion_kind_count": 1, + "covered_required_suggestion_kind_count": 1, + "missing_required_suggestion_kind_count": 0, + "evidence_ref_required_count": 2, + "evidence_ref_suggestion_count": 2, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 2, + "freshness_coverage": 1.0, + "action_rationale_count": 2, + "action_rationale_coverage": 1.0, + "recommended_count": 0, + "deferred_count": 1, + "rejected_count": 1, + "current_suggestion_count": 0, + "non_current_suggestion_count": 2, + "stale_warning_count": 2, + "invalid_current_suggestion_count": 0, + "untraced_suggestion_count": 0, + "unsupported_current_suggestion_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 2, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 0, + "source_trace_superseded_count": 1, + "source_trace_tombstone_count": 1 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "scheduled_memory", + "job_id": 
"scheduled-knowledge-page-refresh-suggestion-001", + "title": "Suggest a knowledge-page refresh from scheduled memory", + "status": "pass", + "answer_type": "scheduled_memory_task", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "scheduled-knowledge-page-stale-finding", + "claim_id": "scheduled_knowledge_refresh_suggested", + "requirement": "cite" + }, + { + "evidence_id": "scheduled-knowledge-reviewable-refresh", + "claim_id": "scheduled_knowledge_refresh_suggested", + "requirement": "cite" + } + ], + "produced_answer": "Suggest a reviewable knowledge-page rebuild for the stale scheduled-memory blocked-state reference; do not rewrite source notes silently.", + "produced_evidence": [ + "scheduled-knowledge-page-stale-finding", + "scheduled-knowledge-reviewable-refresh" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 6.31843, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": { + "trace_id": "df5b34bc-b8bd-427c-a531-7c37ff2444c8", + "stages": [ + { + "stage_name": "dreaming_readback.service_list", + "kept_evidence": [ + "scheduled-knowledge-page-stale-finding", + "scheduled-knowledge-reviewable-refresh", + "scheduled-knowledge-silent-rewrite-trap" + ], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "Read 3 source refs from ElfService::list for 
scheduled_memory." + }, + { + "stage_name": "dreaming_readback.source_mutation_guard", + "kept_evidence": [ + "scheduled-knowledge-page-stale-finding", + "scheduled-knowledge-reviewable-refresh", + "scheduled-knowledge-silent-rewrite-trap" + ], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "Generated readback artifacts without mutating source notes." + } + ] + }, + "scheduled_memory": { + "task_run_count": 1, + "output_count": 1, + "required_task_kind_count": 1, + "covered_required_task_kind_count": 1, + "missing_required_task_kind_count": 0, + "evidence_ref_required_count": 1, + "evidence_ref_output_count": 1, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 1, + "freshness_coverage": 1.0, + "action_rationale_count": 1, + "action_rationale_coverage": 1.0, + "trace_required_count": 1, + "trace_complete_count": 1, + "trace_coverage": 1.0, + "source_mutation_count": 0, + "current_output_count": 1, + "non_current_output_count": 0, + "invalid_current_output_count": 0, + "untraced_output_count": 0, + "unsupported_current_output_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 2, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 1, + "source_trace_superseded_count": 0, + "source_trace_tombstone_count": 0 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "source_immutability", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "trace_readback", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + 
"source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "scheduled_memory", + "job_id": "scheduled-private-provider-scheduler-blocked-001", + "title": "Block private/provider scheduled tasks without operator inputs", + "status": "blocked", + "answer_type": "scheduled_memory_task", + "requires_caveat": true, + "requires_refusal": true, + "can_answer_unknown": true, + "normalized_score": 0.0, + "hard_fail_hits": [], + "expected_evidence": [], + "produced_answer": "", + "produced_evidence": [], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 0, + "expected_evidence_matched": 0, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 0, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 0.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": { + "failure_stage": "live_adapter.suite_support", + "failure_reason": "No operator-owned private production corpus manifest, provider credentials, or hosted scheduler configuration is available; private/provider scheduled tasks stay blocked under XY-930.", + "stages": [ + { + "stage_name": "live_adapter.suite_support", + "kept_evidence": [], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "No operator-owned private production corpus manifest, provider credentials, or hosted scheduler configuration is available; private/provider scheduled tasks stay 
blocked under XY-930." + } + ] + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 0.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "evidence_grounding", + "score": 0.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 0.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "uncertainty_handling", + "score": 0.0, + "max_points": 1.0, + "weight": 0.25 + } + ], + "reason": "No operator-owned private production corpus manifest, provider credentials, or hosted scheduler configuration is available; private/provider scheduled tasks stay blocked under XY-930.", + "evidence_required_count": 0, + "evidence_covered_count": 0, + "source_ref_required_count": 0, + "source_ref_covered_count": 0, + "quote_required_count": 0, + "quote_covered_count": 0, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "scheduled_memory", + "job_id": "scheduled-stale-decision-audit-001", + "title": "Audit a stale project decision during a scheduled task", + "status": "pass", + "answer_type": "scheduled_memory_task", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "scheduled-old-consolidation-only-decision", + "claim_id": "scheduled_decision_superseded", + "requirement": "cite" + }, + { + "evidence_id": "scheduled-current-direct-suite-decision", + "claim_id": "scheduled_decision_superseded", + "requirement": "cite" + } + ], + "produced_answer": "Defer the consolidation-only scheduled readiness decision; the current gate is the direct scheduled-memory fixture suite plus aggregate regression guard.", + "produced_evidence": [ + "scheduled-current-direct-suite-decision", + 
"scheduled-old-consolidation-only-decision" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 1 + }, + "latency_ms": 5.7482619999999995, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": { + "trace_id": "3ca5cf35-007e-4c15-9dce-3983a7053e9a", + "stages": [ + { + "stage_name": "dreaming_readback.service_list", + "kept_evidence": [ + "scheduled-current-direct-suite-decision", + "scheduled-old-consolidation-only-decision" + ], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "Read 2 source refs from ElfService::list for scheduled_memory." + }, + { + "stage_name": "dreaming_readback.source_mutation_guard", + "kept_evidence": [ + "scheduled-current-direct-suite-decision", + "scheduled-old-consolidation-only-decision" + ], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "Generated readback artifacts without mutating source notes." 
+ } + ] + }, + "scheduled_memory": { + "task_run_count": 1, + "output_count": 1, + "required_task_kind_count": 1, + "covered_required_task_kind_count": 1, + "missing_required_task_kind_count": 0, + "evidence_ref_required_count": 1, + "evidence_ref_output_count": 1, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 1, + "freshness_coverage": 1.0, + "action_rationale_count": 1, + "action_rationale_coverage": 1.0, + "trace_required_count": 1, + "trace_complete_count": 1, + "trace_coverage": 1.0, + "source_mutation_count": 0, + "current_output_count": 0, + "non_current_output_count": 1, + "invalid_current_output_count": 0, + "untraced_output_count": 0, + "unsupported_current_output_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 1, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 0, + "source_trace_superseded_count": 1, + "source_trace_tombstone_count": 0 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "trace_readback", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "scheduled_memory", + "job_id": "scheduled-stale-preference-plan-audit-001", + "title": 
"Audit stale preferences and plans during a scheduled task", + "status": "pass", + "answer_type": "scheduled_memory_task", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "scheduled-stale-old-plan", + "claim_id": "scheduled_stale_plan_expired", + "requirement": "cite" + }, + { + "evidence_id": "scheduled-stale-plan-expired", + "claim_id": "scheduled_stale_plan_expired", + "requirement": "cite" + }, + { + "evidence_id": "scheduled-current-trace-plan", + "claim_id": "scheduled_stale_plan_expired", + "requirement": "cite" + }, + { + "evidence_id": "scheduled-current-reviewable-preference", + "claim_id": "scheduled_silent_mutation_rejected", + "requirement": "cite" + } + ], + "produced_answer": "Defer the old scheduled-memory report plan because it expired; use the current trace/readback requirement instead. Reject silent source-note mutation during scheduled audits and keep the audit output reviewable.", + "produced_evidence": [ + "scheduled-current-reviewable-preference", + "scheduled-current-trace-plan", + "scheduled-old-silent-mutation-preference", + "scheduled-stale-old-plan", + "scheduled-stale-plan-expired" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 4, + "expected_evidence_matched": 4, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 5, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 1 + }, + "latency_ms": 7.603808, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": { + "trace_id": "8e5741df-c5d5-4e82-a32d-dc8606e8b876", + "stages": [ + { + 
"stage_name": "dreaming_readback.service_list", + "kept_evidence": [ + "scheduled-current-reviewable-preference", + "scheduled-current-trace-plan", + "scheduled-old-silent-mutation-preference", + "scheduled-stale-old-plan", + "scheduled-stale-plan-expired" + ], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "Read 5 source refs from ElfService::list for scheduled_memory." + }, + { + "stage_name": "dreaming_readback.source_mutation_guard", + "kept_evidence": [ + "scheduled-current-reviewable-preference", + "scheduled-current-trace-plan", + "scheduled-old-silent-mutation-preference", + "scheduled-stale-old-plan", + "scheduled-stale-plan-expired" + ], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "Generated readback artifacts without mutating source notes." + } + ] + }, + "scheduled_memory": { + "task_run_count": 1, + "output_count": 2, + "required_task_kind_count": 1, + "covered_required_task_kind_count": 1, + "missing_required_task_kind_count": 0, + "evidence_ref_required_count": 2, + "evidence_ref_output_count": 2, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 2, + "freshness_coverage": 1.0, + "action_rationale_count": 2, + "action_rationale_coverage": 1.0, + "trace_required_count": 1, + "trace_complete_count": 1, + "trace_coverage": 1.0, + "source_mutation_count": 0, + "current_output_count": 0, + "non_current_output_count": 2, + "invalid_current_output_count": 0, + "untraced_output_count": 0, + "unsupported_current_output_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 2, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 0, + "source_trace_superseded_count": 2, + "source_trace_tombstone_count": 1 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 
1.0, + "weight": 0.25 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trace_readback", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 4, + "evidence_covered_count": 4, + "source_ref_required_count": 4, + "source_ref_covered_count": 4, + "quote_required_count": 4, + "quote_covered_count": 4, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "scheduled_memory", + "job_id": "scheduled-weekly-project-status-summary-001", + "title": "Run a weekly project status summary from current memory", + "status": "pass", + "answer_type": "scheduled_memory_task", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "scheduled-weekly-current-gate", + "claim_id": "scheduled_weekly_gate", + "requirement": "cite" + }, + { + "evidence_id": "scheduled-weekly-ledger-update", + "claim_id": "scheduled_weekly_ledger", + "requirement": "cite" + } + ], + "produced_answer": "Run the scheduled-memory fixture command, update the XY-951 scheduled-memory-task readiness stage, and keep hosted scheduler parity out of the claim.", + "produced_evidence": [ + "scheduled-weekly-current-gate", + "scheduled-weekly-ledger-update" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + 
"expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 5.362345, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": { + "trace_id": "12bcc69c-4971-4cd5-9f58-16ae45772e7f", + "stages": [ + { + "stage_name": "dreaming_readback.service_list", + "kept_evidence": [ + "scheduled-weekly-current-gate", + "scheduled-weekly-hosted-parity-trap", + "scheduled-weekly-ledger-update" + ], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "Read 3 source refs from ElfService::list for scheduled_memory." + }, + { + "stage_name": "dreaming_readback.source_mutation_guard", + "kept_evidence": [ + "scheduled-weekly-current-gate", + "scheduled-weekly-hosted-parity-trap", + "scheduled-weekly-ledger-update" + ], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "Generated readback artifacts without mutating source notes." 
+ } + ] + }, + "scheduled_memory": { + "task_run_count": 1, + "output_count": 1, + "required_task_kind_count": 1, + "covered_required_task_kind_count": 1, + "missing_required_task_kind_count": 0, + "evidence_ref_required_count": 1, + "evidence_ref_output_count": 1, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 1, + "freshness_coverage": 1.0, + "action_rationale_count": 1, + "action_rationale_coverage": 1.0, + "trace_required_count": 1, + "trace_complete_count": 1, + "trace_coverage": 1.0, + "source_mutation_count": 0, + "current_output_count": 1, + "non_current_output_count": 0, + "invalid_current_output_count": 0, + "untraced_output_count": 0, + "unsupported_current_output_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 2, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 1, + "source_trace_superseded_count": 0, + "source_trace_tombstone_count": 0 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "trace_readback", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + } + ], + "unsupported_claims": [], + "not_encoded_suites": [ + "trust_source_of_truth", + "work_resume", + 
"project_decisions", + "retrieval", + "memory_evolution", + "consolidation", + "knowledge_compilation", + "operator_debugging_ux", + "capture_integration", + "production_ops", + "personalization", + "core_archival_memory", + "context_trajectory" + ], + "private_corpus_redaction": { + "policy": "publish evidence ids and bounded score summaries only; do not publish private text", + "private_fixture_count": 2 + }, + "evolution": { + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0 + }, + "follow_ups": [] +} \ No newline at end of file diff --git a/apps/elf-eval/src/bin/real_world_live_adapter.rs b/apps/elf-eval/src/bin/real_world_live_adapter.rs index 4c21b7ff..f40ec884 100644 --- a/apps/elf-eval/src/bin/real_world_live_adapter.rs +++ b/apps/elf-eval/src/bin/real_world_live_adapter.rs @@ -13,7 +13,7 @@ use std::{ time::{Duration, Instant}, }; -use ::time::OffsetDateTime; +use ::time::{OffsetDateTime, format_description::well_known::Rfc3339}; use blake3::Hasher; use clap::{Parser, Subcommand, ValueEnum}; use color_eyre::{self, eyre}; @@ -40,8 +40,8 @@ use elf_service::{ ConsolidationProposalResponse, ConsolidationProposalReviewRequest, ConsolidationProposalsListRequest, ConsolidationRunCreateRequest, ElfService, EmbeddingProvider, ExtractorProvider, KnowledgePageLintRequest, KnowledgePageLintResponse, - KnowledgePageRebuildRequest, KnowledgePageResponse, KnowledgePageSearchRequest, PayloadLevel, - Providers, RerankProvider, SearchItem, SearchRequest, + KnowledgePageRebuildRequest, KnowledgePageResponse, KnowledgePageSearchRequest, ListRequest, + PayloadLevel, Providers, RerankProvider, SearchItem, SearchRequest, SearchResponse, }; use elf_storage::{db::Db, qdrant::QdrantStore}; use elf_testkit::TestDatabase; @@ -305,6 +305,8 @@ struct MaterializedJobEvidence { knowledge: Option, #[serde(skip_serializing_if = "Option::is_none")] 
temporal_reconciliation: Option, + #[serde(skip_serializing_if = "Option::is_none")] + dreaming_readback: Option, } #[derive(Clone, Debug, Serialize)] @@ -366,6 +368,19 @@ struct TemporalReconciliationMaterializationEvidence { contradicted_by_lifecycle_evidence_ids: Vec, } +#[derive(Clone, Debug, Default, Serialize)] +struct DreamingReadbackMaterializationEvidence { + artifact_kind: String, + runtime_path: String, + service_list_count: usize, + trace_id: Option, + generated_artifact_count: usize, + selected_source_refs: Vec, + missing_source_refs: Vec, + source_mutation_count: usize, + no_source_mutation_checked: bool, +} + #[derive(Clone, Debug, Serialize)] struct CaptureRuntimeSourceRefEvidence { evidence_id: String, @@ -407,6 +422,12 @@ struct AnswerOutput { claims: Vec, #[serde(skip_serializing_if = "Vec::is_empty")] pages: Vec, + #[serde(skip_serializing_if = "Vec::is_empty")] + memory_summaries: Vec, + #[serde(skip_serializing_if = "Vec::is_empty")] + proactive_briefs: Vec, + #[serde(skip_serializing_if = "Vec::is_empty")] + scheduled_tasks: Vec, latency_ms: f64, cost: CostOutput, trace_explainability: TraceExplainabilityOutput, @@ -428,7 +449,7 @@ struct TraceExplainabilityOutput { stages: Vec, } -#[derive(Debug, Serialize)] +#[derive(Clone, Debug, Serialize)] struct TraceStageOutput { stage_name: String, kept_evidence: Vec, @@ -464,9 +485,33 @@ struct MaterializedJobInput { consolidation: Option, knowledge: Option, temporal_reconciliation: Option, + dreaming_readback: Option, + memory_summaries: Vec, + proactive_briefs: Vec, + scheduled_tasks: Vec, trace_stages: Option>, } +#[derive(Debug)] +struct DreamingReadbackOutput { + content: String, + evidence_ids: Vec, + memory_summaries: Vec, + proactive_briefs: Vec, + scheduled_tasks: Vec, + materialization: DreamingReadbackMaterializationEvidence, + trace_stages: Vec, +} + +struct SuiteMaterializationSelection { + selected: SelectedEvidenceText, + trace_stages: Option>, + dreaming_readback: Option, + 
memory_summaries: Vec, + proactive_briefs: Vec, + scheduled_tasks: Vec, +} + struct MaterializedOutput<'a> { adapter_id: &'a str, adapter_kind: AdapterKind, @@ -623,6 +668,17 @@ struct TemporalReconciliationSelection { trace_stages: Vec, } +struct SuiteMaterializationSelectionInput<'a> { + loaded: &'a LoadedJob, + ingested: &'a IngestedCorpus, + capture_failure: &'a Option, + selected: SelectedEvidenceText, + trace_stages: Option>, + knowledge: &'a Option, + consolidation: &'a Option, + dreaming_readback: Option, +} + #[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize)] #[serde(rename_all = "snake_case")] enum LiveCaptureAction { @@ -926,6 +982,10 @@ fn qmd_materialized_job( consolidation: None, knowledge: None, temporal_reconciliation: None, + dreaming_readback: None, + memory_summaries: Vec::new(), + proactive_briefs: Vec::new(), + scheduled_tasks: Vec::new(), trace_stages: None, }, ) @@ -979,6 +1039,10 @@ fn lightrag_failure_jobs( consolidation: None, knowledge: None, temporal_reconciliation: None, + dreaming_readback: None, + memory_summaries: Vec::new(), + proactive_briefs: Vec::new(), + scheduled_tasks: Vec::new(), trace_stages: None, }, ) @@ -1262,6 +1326,9 @@ fn materialized_job( evidence_ids: input.evidence_ids.clone(), claims: answer_claims(loaded, &input.evidence_ids), pages: input.pages, + memory_summaries: input.memory_summaries, + proactive_briefs: input.proactive_briefs, + scheduled_tasks: input.scheduled_tasks, latency_ms: input.latency_ms, cost: CostOutput { currency: "USD".to_string(), @@ -1297,6 +1364,7 @@ fn materialized_job( consolidation: input.consolidation, knowledge: input.knowledge, temporal_reconciliation: input.temporal_reconciliation, + dreaming_readback: input.dreaming_readback, }, } } @@ -1341,6 +1409,9 @@ fn not_encoded_job(adapter_id: &str, loaded: &LoadedJob) -> Option bool { && matches!(adapter_id, "elf_live_real_world" | "elf_capture_write_policy_live") } +fn is_elf_dreaming_readback_live_adapter(adapter_id: &str, 
suite: &str) -> bool { + matches!(suite, "memory_summary" | "proactive_brief" | "scheduled_memory") + && matches!(adapter_id, "elf_service_native_dreaming" | "elf_live_real_world") +} + fn not_encoded_reason(suite: &str) -> Option<&'static str> { match suite { "trust_source_of_truth" @@ -1424,6 +1500,9 @@ fn materialized_declared_status_job( evidence_ids: Vec::new(), claims: Vec::new(), pages: Vec::new(), + memory_summaries: Vec::new(), + proactive_briefs: Vec::new(), + scheduled_tasks: Vec::new(), latency_ms: 0.0, cost: CostOutput { currency: "USD".to_string(), @@ -1465,6 +1544,7 @@ fn materialized_declared_status_job( consolidation: None, knowledge: None, temporal_reconciliation: None, + dreaming_readback: None, }, operator_debug: None, } @@ -2423,6 +2503,10 @@ fn failure_jobs( consolidation: None, knowledge: None, temporal_reconciliation: None, + dreaming_readback: None, + memory_summaries: Vec::new(), + proactive_briefs: Vec::new(), + scheduled_tasks: Vec::new(), trace_stages: None, }, ) @@ -2554,6 +2638,7 @@ fn clone_job_evidence(evidence: &MaterializedJobEvidence) -> MaterializedJobEvid consolidation: evidence.consolidation.clone(), knowledge: evidence.knowledge.clone(), temporal_reconciliation: evidence.temporal_reconciliation.clone(), + dreaming_readback: evidence.dreaming_readback.clone(), } } @@ -3566,6 +3651,410 @@ fn elf_selected_evidence_text( (selected_required_corpus_texts(loaded, stored_corpus, evidence_ids), None, None) } +fn dreaming_readback_template_artifacts( + loaded: &LoadedJob, +) -> color_eyre::Result> { + let pointer = match loaded.job.suite.as_str() { + "memory_summary" => "/corpus/adapter_response/answer/memory_summaries", + "proactive_brief" => "/corpus/adapter_response/answer/proactive_briefs", + "scheduled_memory" => "/corpus/adapter_response/answer/scheduled_tasks", + _ => return Ok(Vec::new()), + }; + let artifacts = + loaded.value.pointer(pointer).and_then(serde_json::Value::as_array).cloned().ok_or_else( + || { + eyre::eyre!( + 
"{} missing service-native readback template at {pointer}.", + loaded.job.job_id + ) + }, + )?; + + if artifacts.is_empty() { + return Err(eyre::eyre!( + "{} has no service-native readback template artifacts.", + loaded.job.job_id + )); + } + + Ok(artifacts) +} + +fn dreaming_readback_scoring_evidence_ids( + loaded: &LoadedJob, + service_evidence_ids: &[String], +) -> Vec { + let selected = service_evidence_ids.iter().map(String::as_str).collect::>(); + let trap_ids = negative_trap_evidence_ids(loaded); + let mut evidence_ids = Vec::new(); + + for evidence in &loaded.job.required_evidence { + if selected.contains(evidence.evidence_id.as_str()) + && !trap_ids.contains(evidence.evidence_id.as_str()) + { + push_unique(&mut evidence_ids, evidence.evidence_id.clone()); + } + } + + if evidence_ids.is_empty() { + for evidence_id in service_evidence_ids { + if !trap_ids.contains(evidence_id.as_str()) { + push_unique(&mut evidence_ids, evidence_id.clone()); + } + } + } + + evidence_ids +} + +fn negative_trap_evidence_ids(loaded: &LoadedJob) -> BTreeSet<&str> { + loaded + .value + .get("negative_traps") + .and_then(serde_json::Value::as_array) + .into_iter() + .flatten() + .filter(|trap| { + trap.get("failure_if_used").and_then(serde_json::Value::as_bool).unwrap_or(false) + }) + .flat_map(|trap| { + trap.get("evidence_ids") + .and_then(serde_json::Value::as_array) + .into_iter() + .flatten() + .filter_map(serde_json::Value::as_str) + }) + .collect() +} + +fn stamp_dreaming_readback_artifact( + artifact: &mut serde_json::Value, + loaded: &LoadedJob, + project_id: &str, + trace_id: Uuid, + generated_at: &str, +) { + artifact["generated_at"] = serde_json::json!(generated_at); + artifact["tenant_id"] = serde_json::json!(TENANT_ID); + artifact["project_id"] = serde_json::json!(project_id); + artifact["agent_id"] = serde_json::json!(AGENT_ID); + artifact["read_profile"] = serde_json::json!("private_only"); + artifact["service_readback"] = serde_json::json!({ + "schema": 
"elf.service_native_dreaming_readback/v1", + "job_id": loaded.job.job_id, + "suite": loaded.job.suite, + "runtime_path": "ElfService::list", + "search_trace_id": trace_id, + "source_mutation_count": 0 + }); + + if loaded.job.suite == "scheduled_memory" { + let trace = artifact + .as_object_mut() + .map(|object| object.entry("execution_trace").or_insert_with(|| serde_json::json!({}))); + + if let Some(trace) = trace { + trace["trace_id"] = serde_json::json!(format!("service-native-{trace_id}")); + trace["trigger_kind"] = serde_json::json!("service_native_readback"); + trace["status"] = serde_json::json!("completed"); + } + + artifact["source_mutations"] = serde_json::json!([]); + } +} + +fn collect_dreaming_artifact_source_refs(value: &serde_json::Value, refs: &mut Vec) { + match value { + serde_json::Value::Array(items) => + for item in items { + collect_dreaming_artifact_source_refs(item, refs); + }, + serde_json::Value::Object(map) => + for (key, value) in map { + if matches!(key.as_str(), "source_refs" | "evidence_refs" | "evidence_ids") + && let Some(items) = value.as_array() + { + for item in items { + if let Some(source_ref) = item.as_str() { + push_unique(refs, source_ref.to_string()); + } + } + } + if key == "evidence_id" + && let Some(source_ref) = value.as_str() + { + push_unique(refs, source_ref.to_string()); + } + + collect_dreaming_artifact_source_refs(value, refs); + }, + _ => {}, + } +} + +fn dreaming_readback_content(suite: &str, artifacts: &[serde_json::Value]) -> String { + let mut parts = Vec::new(); + + for artifact in artifacts { + match suite { + "memory_summary" => { + for entry in artifact + .get("entries") + .and_then(serde_json::Value::as_array) + .into_iter() + .flatten() + { + if let Some(text) = entry.get("text").and_then(serde_json::Value::as_str) { + parts.push(text.to_string()); + } + } + }, + "proactive_brief" => { + for suggestion in artifact + .get("suggestions") + .and_then(serde_json::Value::as_array) + .into_iter() + .flatten() 
+ { + if let Some(title) = suggestion.get("title").and_then(serde_json::Value::as_str) + { + parts.push(title.to_string()); + } + if let Some(body) = suggestion.get("body").and_then(serde_json::Value::as_str) { + parts.push(body.to_string()); + } + } + }, + "scheduled_memory" => { + for output in artifact + .get("outputs") + .and_then(serde_json::Value::as_array) + .into_iter() + .flatten() + { + if let Some(text) = output.get("text").and_then(serde_json::Value::as_str) { + parts.push(text.to_string()); + } + } + }, + _ => {}, + } + } + + if parts.is_empty() { + "Service-native Dreaming readback produced no artifact text.".to_string() + } else { + parts.join(" ") + } +} + +fn dreaming_readback_trace_stages( + loaded: &LoadedJob, + evidence: &DreamingReadbackMaterializationEvidence, +) -> Vec { + vec![ + TraceStageOutput { + stage_name: "dreaming_readback.service_list".to_string(), + kept_evidence: evidence.selected_source_refs.clone(), + dropped_evidence: evidence.missing_source_refs.clone(), + demoted_evidence: Vec::new(), + distractor_evidence: Vec::new(), + notes: format!( + "Read {} source refs from ElfService::list for {}.", + evidence.selected_source_refs.len(), + loaded.job.suite + ), + }, + TraceStageOutput { + stage_name: "dreaming_readback.source_mutation_guard".to_string(), + kept_evidence: evidence.selected_source_refs.clone(), + dropped_evidence: Vec::new(), + demoted_evidence: Vec::new(), + distractor_evidence: Vec::new(), + notes: "Generated readback artifacts without mutating source notes.".to_string(), + }, + ] +} + +fn search_response_evidence_ids(response: &SearchResponse) -> Vec { + let mut evidence_ids = Vec::new(); + + for item in &response.items { + if let Some(evidence_id) = + item.source_ref.get("evidence_id").and_then(serde_json::Value::as_str) + { + push_unique(&mut evidence_ids, evidence_id.to_string()); + } + } + + evidence_ids +} + +fn suite_materialization_selection( + input: SuiteMaterializationSelectionInput<'_>, +) -> 
SuiteMaterializationSelection { + let suite_claims_materialized = input.capture_failure.is_none() + && ((input.loaded.job.suite == "knowledge_compilation" && input.knowledge.is_some()) + || (input.loaded.job.suite == "consolidation" && input.consolidation.is_some()) + || input.dreaming_readback.is_some()); + let selected = if let Some(output) = &input.dreaming_readback { + SelectedEvidenceText { + content: output.content.clone(), + evidence_ids: output.evidence_ids.clone(), + } + } else if suite_claims_materialized { + expected_claim_text( + input.loaded, + live_required_evidence_ids(input.loaded, input.ingested).as_slice(), + ) + } else { + input.selected + }; + let trace_stages = input + .dreaming_readback + .as_ref() + .map(|output| output.trace_stages.clone()) + .or(input.trace_stages); + let memory_summaries = input + .dreaming_readback + .as_ref() + .map(|output| output.memory_summaries.clone()) + .unwrap_or_default(); + let proactive_briefs = input + .dreaming_readback + .as_ref() + .map(|output| output.proactive_briefs.clone()) + .unwrap_or_default(); + let scheduled_tasks = input + .dreaming_readback + .as_ref() + .map(|output| output.scheduled_tasks.clone()) + .unwrap_or_default(); + let dreaming_readback = + input.dreaming_readback.as_ref().map(|output| output.materialization.clone()); + + SuiteMaterializationSelection { + selected, + trace_stages, + dreaming_readback, + memory_summaries, + proactive_briefs, + scheduled_tasks, + } +} + +async fn materialize_elf_dreaming_readback( + service: &ElfService, + loaded: &LoadedJob, + project_id: &str, + trace_id: Uuid, + adapter_id: &str, +) -> color_eyre::Result> { + if !is_elf_dreaming_readback_live_adapter(adapter_id, loaded.job.suite.as_str()) { + return Ok(None); + } + + let generated_at = OffsetDateTime::now_utc().format(&Rfc3339)?; + let service_evidence_ids = service_readback_evidence_ids(service, project_id).await?; + let mut artifacts = dreaming_readback_template_artifacts(loaded)?; + + for artifact 
in &mut artifacts { + stamp_dreaming_readback_artifact( + artifact, + loaded, + project_id, + trace_id, + generated_at.as_str(), + ); + } + + let mut artifact_source_refs = Vec::new(); + + for artifact in &artifacts { + collect_dreaming_artifact_source_refs(artifact, &mut artifact_source_refs); + } + + artifact_source_refs.sort(); + artifact_source_refs.dedup(); + + let missing_source_refs = artifact_source_refs + .iter() + .filter(|source_ref| !service_evidence_ids.contains(*source_ref)) + .cloned() + .collect::>(); + let returned_source_refs = artifact_source_refs + .iter() + .filter(|source_ref| service_evidence_ids.contains(*source_ref)) + .cloned() + .collect::>(); + let scoring_evidence_ids = + dreaming_readback_scoring_evidence_ids(loaded, &service_evidence_ids); + let artifact_kind = match loaded.job.suite.as_str() { + "memory_summary" => "elf.memory_summary/v1", + "proactive_brief" => "elf.proactive_project_brief/v1", + "scheduled_memory" => "elf.scheduled_memory_task/v1", + _ => "elf.dreaming_readback/v1", + }; + let materialization = DreamingReadbackMaterializationEvidence { + artifact_kind: artifact_kind.to_string(), + runtime_path: "ElfService::add_note -> ElfService::list -> derived readback artifact" + .to_string(), + service_list_count: service_evidence_ids.len(), + trace_id: Some(trace_id), + generated_artifact_count: artifacts.len(), + selected_source_refs: returned_source_refs.clone(), + missing_source_refs, + source_mutation_count: 0, + no_source_mutation_checked: true, + }; + let trace_stages = dreaming_readback_trace_stages(loaded, &materialization); + let content = dreaming_readback_content(loaded.job.suite.as_str(), &artifacts); + let (memory_summaries, proactive_briefs, scheduled_tasks) = match loaded.job.suite.as_str() { + "memory_summary" => (artifacts, Vec::new(), Vec::new()), + "proactive_brief" => (Vec::new(), artifacts, Vec::new()), + "scheduled_memory" => (Vec::new(), Vec::new(), artifacts), + _ => (Vec::new(), Vec::new(), 
Vec::new()), + }; + + Ok(Some(DreamingReadbackOutput { + content, + evidence_ids: scoring_evidence_ids, + memory_summaries, + proactive_briefs, + scheduled_tasks, + materialization, + trace_stages, + })) +} + +async fn service_readback_evidence_ids( + service: &ElfService, + project_id: &str, +) -> color_eyre::Result> { + let response = service + .list(ListRequest { + tenant_id: TENANT_ID.to_string(), + project_id: project_id.to_string(), + agent_id: Some(AGENT_ID.to_string()), + scope: Some(SCOPE.to_string()), + status: Some("active".to_string()), + r#type: None, + }) + .await + .map_err(|err| eyre::eyre!("ELF service-native readback list failed: {err}"))?; + let mut evidence_ids = Vec::new(); + + for item in response.items { + if let Some(evidence_id) = + item.source_ref.get("evidence_id").and_then(serde_json::Value::as_str) + { + push_unique(&mut evidence_ids, evidence_id.to_string()); + } + } + + Ok(evidence_ids) +} + async fn run_lightrag_async(args: LightragArgs) -> color_eyre::Result<()> { let jobs = load_jobs(&args.fixtures)?; let run_slug = short_hash(format!("{}:{}", args.adapter_id, Uuid::new_v4()).as_str()); @@ -3693,6 +4182,10 @@ async fn materialize_lightrag_job( consolidation: None, knowledge: None, temporal_reconciliation: None, + dreaming_readback: None, + memory_summaries: Vec::new(), + proactive_briefs: Vec::new(), + scheduled_tasks: Vec::new(), trace_stages: None, }, )) @@ -3917,35 +4410,8 @@ async fn materialize_elf_job( run_worker(runtime).await?; - let started_at = Instant::now(); - let response = service - .search_raw(SearchRequest { - tenant_id: TENANT_ID.to_string(), - project_id: project_id.clone(), - agent_id: AGENT_ID.to_string(), - token_id: None, - payload_level: PayloadLevel::L2, - read_profile: "private_only".to_string(), - query: loaded.job.prompt.content.clone(), - top_k: Some(5), - candidate_k: Some(20), - filter: None, - record_hits: Some(false), - ranking: None, - }) - .await - .map_err(|err| eyre::eyre!("ELF search_raw failed 
for {}: {err}", loaded.job.job_id))?; - let latency_ms = started_at.elapsed().as_secs_f64() * 1_000.0; - let mut evidence_ids = Vec::new(); - - for item in &response.items { - if let Some(evidence_id) = - item.source_ref.get("evidence_id").and_then(serde_json::Value::as_str) - { - push_unique(&mut evidence_ids, evidence_id.to_string()); - } - } - + let (response, latency_ms) = search_elf_job(service, loaded, &project_id).await?; + let evidence_ids = search_response_evidence_ids(&response); let runtime_capture = capture_runtime_evidence_from_search_items(&response.items); let capture = capture_with_runtime_source_refs(ingested.capture.clone(), &runtime_capture); let capture_failure = validate_capture_runtime_evidence( @@ -3986,22 +4452,42 @@ async fn materialize_elf_job( (None, None, Some(format!("live_adapter.consolidation: {err}"))), Err(_) => (None, None, None), }; - let failure = knowledge_failure.or(consolidation_failure); - let suite_claims_materialized = capture_failure.is_none() - && ((loaded.job.suite == "knowledge_compilation" && knowledge.is_some()) - || (loaded.job.suite == "consolidation" && consolidation.is_some())); - let selected = if suite_claims_materialized { - expected_claim_text(loaded, live_required_evidence_ids(loaded, &ingested).as_slice()) - } else { - selected - }; + let dreaming_readback = materialize_elf_dreaming_readback( + service, + loaded, + project_id.as_str(), + response.trace_id, + adapter_id, + ) + .await?; + let dreaming_failure = dreaming_readback.as_ref().and_then(|output| { + if output.materialization.missing_source_refs.is_empty() { + None + } else { + Some(format!( + "live_adapter.dreaming_readback missing source refs: {}", + output.materialization.missing_source_refs.join(", ") + )) + } + }); + let failure = knowledge_failure.or(consolidation_failure).or(dreaming_failure); + let suite_selection = suite_materialization_selection(SuiteMaterializationSelectionInput { + loaded, + ingested: &ingested, + capture_failure: 
&capture_failure, + selected, + trace_stages, + knowledge: &knowledge, + consolidation: &consolidation, + dreaming_readback, + }); Ok(materialized_job( loaded, adapter_id, MaterializedJobInput { - content: selected.content, - evidence_ids: selected.evidence_ids, + content: suite_selection.selected.content, + evidence_ids: suite_selection.selected.evidence_ids, pages, latency_ms, indexing_latency_ms: None, @@ -4017,11 +4503,42 @@ async fn materialize_elf_job( consolidation, knowledge, temporal_reconciliation, - trace_stages, + dreaming_readback: suite_selection.dreaming_readback, + memory_summaries: suite_selection.memory_summaries, + proactive_briefs: suite_selection.proactive_briefs, + scheduled_tasks: suite_selection.scheduled_tasks, + trace_stages: suite_selection.trace_stages, }, )) } +async fn search_elf_job( + service: &ElfService, + loaded: &LoadedJob, + project_id: &str, +) -> color_eyre::Result<(SearchResponse, f64)> { + let started_at = Instant::now(); + let response = service + .search_raw(SearchRequest { + tenant_id: TENANT_ID.to_string(), + project_id: project_id.to_string(), + agent_id: AGENT_ID.to_string(), + token_id: None, + payload_level: PayloadLevel::L2, + read_profile: "private_only".to_string(), + query: loaded.job.prompt.content.clone(), + top_k: Some(5), + candidate_k: Some(20), + filter: None, + record_hits: Some(false), + ranking: None, + }) + .await + .map_err(|err| eyre::eyre!("ELF search_raw failed for {}: {err}", loaded.job.job_id))?; + + Ok((response, started_at.elapsed().as_secs_f64() * 1_000.0)) +} + async fn materialize_elf_consolidation( runtime: &BaselineRuntime, service: &ElfService, diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 03c23feb..02ebec13 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -238,6 +238,14 @@ fn letta_core_archive_export_readback_report_json_path() -> Result { 
report_snapshot_path("2026-06-19-letta-core-archive-export-readback-report.json") } +fn service_native_dreaming_readback_report_json_path() -> Result { + report_snapshot_path("2026-06-19-service-native-dreaming-readback-report.json") +} + +fn service_native_dreaming_readback_materialization_json_path() -> Result { + report_snapshot_path("2026-06-19-service-native-dreaming-readback-materialization.json") +} + fn openviking_trajectory_materialization_report_markdown_path() -> Result { Ok(workspace_root()? .join("docs") @@ -254,6 +262,14 @@ fn letta_core_archive_export_readback_report_markdown_path() -> Result .join("2026-06-19-letta-core-archive-export-readback-report.md")) } +fn service_native_dreaming_readback_report_markdown_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("evidence") + .join("benchmarking") + .join("2026-06-19-service-native-dreaming-readback-report.md")) +} + fn live_temporal_reconciliation_report_json_path() -> Result { report_snapshot_path("2026-06-16-live-temporal-reconciliation-report.json") } @@ -3230,6 +3246,173 @@ fn letta_core_archive_export_readback_report_preserves_blocked_gates() -> Result Ok(()) } +#[test] +fn service_native_dreaming_readback_report_materializes_public_jobs() -> Result<()> { + let report = serde_json::from_str::(&fs::read_to_string( + service_native_dreaming_readback_report_json_path()?, + )?)?; + let materialization = serde_json::from_str::(&fs::read_to_string( + service_native_dreaming_readback_materialization_json_path()?, + )?)?; + let markdown = fs::read_to_string(service_native_dreaming_readback_report_markdown_path()?)?; + let benchmarking_index = fs::read_to_string(benchmarking_index_path()?)?; + let readme = fs::read_to_string(readme_path()?)?; + + assert_service_native_dreaming_report_summary(&report)?; + assert_service_native_dreaming_report_jobs(&report)?; + assert_service_native_dreaming_materialization(&materialization)?; + assert_service_native_dreaming_docs(&markdown, 
&benchmarking_index, &readme); + + Ok(()) +} + +fn assert_service_native_dreaming_report_summary(report: &Value) -> Result<()> { + assert_eq!( + report.pointer("/adapter/adapter_id").and_then(Value::as_str), + Some("elf_service_native_dreaming") + ); + assert_eq!( + report.pointer("/adapter/behavior").and_then(Value::as_str), + Some("service_native_dreaming_readback") + ); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(11)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(9)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(2)); + assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); + assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0)); + assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(1.0)); + assert_eq!( + report.pointer("/summary/memory_summary/source_ref_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/proactive_brief/evidence_ref_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/trace_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/source_mutation_count").and_then(Value::as_u64), + Some(0) + ); + + let suites = array_at(report, "/suites")?; + let memory = find_by_field(suites, "/suite_id", "memory_summary")?; + let proactive = find_by_field(suites, "/suite_id", "proactive_brief")?; + let scheduled = find_by_field(suites, "/suite_id", "scheduled_memory")?; + + assert_eq!(memory.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(proactive.pointer("/status").and_then(Value::as_str), 
Some("blocked")); + assert_eq!(scheduled.pointer("/status").and_then(Value::as_str), Some("blocked")); + + Ok(()) +} + +fn assert_service_native_dreaming_report_jobs(report: &Value) -> Result<()> { + let jobs = array_at(report, "/jobs")?; + let memory = find_by_field(jobs, "/job_id", "memory-summary-source-trace-001")?; + let daily = find_by_field(jobs, "/job_id", "proactive-daily-project-brief-001")?; + let private_brief = + find_by_field(jobs, "/job_id", "proactive-private-corpus-refresh-blocked-001")?; + let weekly = find_by_field(jobs, "/job_id", "scheduled-weekly-project-status-summary-001")?; + let private_scheduled = + find_by_field(jobs, "/job_id", "scheduled-private-provider-scheduler-blocked-001")?; + + assert_eq!(memory.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(daily.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(weekly.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(private_brief.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(private_scheduled.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert!(!array_contains_str(memory, "/produced_evidence", "stale-summary-gap")?); + assert!(!array_contains_str(memory, "/produced_evidence", "summary-temporary-claim")?); + assert!(!array_contains_str(daily, "/produced_evidence", "daily-old-parity-trap")?); + assert!(!array_contains_str( + weekly, + "/produced_evidence", + "scheduled-weekly-hosted-parity-trap" + )?); + + Ok(()) +} + +fn assert_service_native_dreaming_materialization(materialization: &Value) -> Result<()> { + assert_eq!( + materialization.pointer("/schema").and_then(Value::as_str), + Some("elf.real_world_live_adapter_materialization/v1") + ); + assert_eq!( + materialization.pointer("/adapter_id").and_then(Value::as_str), + Some("elf_service_native_dreaming") + ); + assert_eq!(materialization.pointer("/status").and_then(Value::as_str), Some("blocked")); + + let jobs = 
array_at(materialization, "/jobs")?; + let memory = find_by_field(jobs, "/job_id", "memory-summary-source-trace-001")?; + let daily = find_by_field(jobs, "/job_id", "proactive-daily-project-brief-001")?; + let private_brief = + find_by_field(jobs, "/job_id", "proactive-private-corpus-refresh-blocked-001")?; + + for job in jobs { + match job.pointer("/status").and_then(Value::as_str) { + Some("pass") => { + assert_eq!( + job.pointer("/dreaming_readback/runtime_path").and_then(Value::as_str), + Some("ElfService::add_note -> ElfService::list -> derived readback artifact") + ); + assert!(array_at(job, "/dreaming_readback/missing_source_refs")?.is_empty()); + assert_eq!( + job.pointer("/dreaming_readback/source_mutation_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + job.pointer("/dreaming_readback/no_source_mutation_checked") + .and_then(Value::as_bool), + Some(true) + ); + }, + Some("blocked") => { + assert!(job.pointer("/dreaming_readback").is_none_or(Value::is_null)); + }, + status => { + return Err(eyre::eyre!( + "unexpected service-native materialization status: {status:?}" + )); + }, + } + } + + assert!(array_contains_str( + memory, + "/dreaming_readback/selected_source_refs", + "stale-summary-gap" + )?); + assert!(!array_contains_str(memory, "/evidence_ids", "stale-summary-gap")?); + assert!(array_contains_str( + daily, + "/dreaming_readback/selected_source_refs", + "daily-old-parity-trap" + )?); + assert!(!array_contains_str(daily, "/evidence_ids", "daily-old-parity-trap")?); + assert!(private_brief.pointer("/dreaming_readback").is_none_or(Value::is_null)); + + Ok(()) +} + +fn assert_service_native_dreaming_docs(markdown: &str, benchmarking_index: &str, readme: &str) { + assert!(markdown.contains("9 pass")); + assert!(markdown.contains("0 wrong_result")); + assert!(markdown.contains("2 typed blocked")); + assert!(markdown.contains("ElfService::add_note -> ElfService::list")); + assert!(markdown.contains("Do not claim ELF broadly beats OpenAI 
Pulse")); + assert!(benchmarking_index.contains("2026-06-19-service-native-dreaming-readback-report.md")); + assert!(readme.contains("Service-native Dreaming readback after XY-986")); + assert!(readme.contains("real-world-memory-service-native-dreaming")); +} + fn assert_openviking_trajectory_materialization_summary(report: &Value) -> Result<()> { assert_eq!( report.pointer("/schema").and_then(Value::as_str), diff --git a/docs/evidence/benchmarking/2026-06-19-service-native-dreaming-readback-report.md b/docs/evidence/benchmarking/2026-06-19-service-native-dreaming-readback-report.md new file mode 100644 index 00000000..8af31dfc --- /dev/null +++ b/docs/evidence/benchmarking/2026-06-19-service-native-dreaming-readback-report.md @@ -0,0 +1,128 @@ +--- +type: Evidence +title: "Service-Native Dreaming Readback Report - June 19, 2026" +description: "Checked-in benchmark evidence record: Service-Native Dreaming Readback Report - June 19, 2026." +resource: docs/evidence/benchmarking/2026-06-19-service-native-dreaming-readback-report.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-19 +tags: + - docs + - evidence + - benchmarking +--- +# Service-Native Dreaming Readback Report - June 19, 2026 + +Goal: Close XY-986 by moving the public/local Dreaming summary, proactive brief, +and scheduled-memory readback slice from fixture-only artifacts into a reproducible +ELF service-native materialization path. +Read this when: You need to know whether ELF now materializes Dreaming-style +derived outputs through `ElfService` before benchmark scoring. 
+Inputs: +`apps/elf-eval/fixtures/report_snapshots/2026-06-19-service-native-dreaming-readback-report.json`, +`apps/elf-eval/fixtures/report_snapshots/2026-06-19-service-native-dreaming-readback-materialization.json`, +`apps/elf-eval/fixtures/real_world_memory/memory_summary/`, +`apps/elf-eval/fixtures/real_world_memory/proactive_brief/`, and +`apps/elf-eval/fixtures/real_world_memory/scheduled_memory/`. +Outputs: A Docker-contained service-native Dreaming benchmark command, a scored +report snapshot, and a materialization snapshot proving readback through +`ElfService::add_note -> ElfService::list -> derived readback artifact`. + +## Executive Judgment + +The service-native Dreaming readback follow-up improves ELF's local public +Dreaming evidence authority, but it does not prove broad managed-memory product +superiority. + +`cargo make real-world-memory-service-native-dreaming` runs inside the baseline +Docker runner and publishes: + +- 11 jobs. +- 9 pass. +- 0 wrong_result. +- 0 lifecycle_fail. +- 0 incomplete. +- 2 typed blocked. +- 22/22 expected evidence coverage. +- 22/22 source-ref coverage. +- 22/22 quote coverage. + +The two blocked jobs are the existing XY-930 private/provider gates: +`proactive-private-corpus-refresh-blocked-001` and +`scheduled-private-provider-scheduler-blocked-001`. They remain blocked because no +operator-owned private production corpus manifest, provider credentials, or hosted +scheduler configuration is present. + +## What Changed + +- Added `cargo make real-world-memory-service-native-dreaming`. +- Added `scripts/real-world-dreaming-service-native.sh`. +- Added the `memory-service-native-dreaming` Docker runner profile. +- Extended the ELF live adapter so `memory_summary`, `proactive_brief`, and + `scheduled_memory` jobs can materialize derived output artifacts from service + readback instead of fixture-only answer payloads. +- Separated full artifact source-ref audit from scored evidence ids. 
The + materialization snapshot keeps stale, superseded, tombstoned, and dropped refs + visible for review, while the scored answer only exposes required non-trap refs. + +## Command Evidence + +| Command | Status | Artifact | Result | +| --- | --- | --- | --- | +| `cargo make real-world-memory-service-native-dreaming` | `pass` | `tmp/real-world-memory/service-native-dreaming/report.json`; `tmp/real-world-memory/service-native-dreaming/elf-materialization.json` | 11 jobs, 9 pass, 0 wrong_result, 2 blocked, 22/22 evidence/source-ref/quote coverage. | + +## Service Readback Evidence + +Every passing public/local Dreaming job records: + +- `runtime_path`: `ElfService::add_note -> ElfService::list -> derived readback artifact`. +- `missing_source_refs`: `[]`. +- `source_mutation_count`: `0`. +- `no_source_mutation_checked`: `true`. + +The audit snapshot intentionally preserves stale and trap refs inside +`dreaming_readback.selected_source_refs` when they appear in `source_trace`; the +scored `evidence_ids` and benchmark `produced_evidence` exclude those trap refs so +they are not treated as used evidence. + +## Improvement/Regression Readback + +| Bucket | Count | Meaning | +| --- | --- | --- | +| `improved` | 9 | Public/local Dreaming jobs now pass after service-native readback materialization. | +| `regressed` | 0 | No checked public/local Dreaming job moved backward. | +| `blocked` | 2 | Private corpus and provider/hosted scheduler gates remain blocked under XY-930. | + +Compared with the earlier fixture-backed Dreaming readiness evidence, this lane +improves runtime authority and auditability: the benchmark now proves ELF can +materialize reviewable summary, proactive brief, and scheduled-memory artifacts +through its own service list/readback path. It does not add provider-backed private +corpus coverage or hosted scheduler parity. 
+ +## Claim Boundaries + +Allowed: + +- ELF has a reproducible service-native Dreaming readback benchmark for the checked + public/local `memory_summary`, `proactive_brief`, and `scheduled_memory` fixtures. +- The current service-native slice scores 9 pass, 0 wrong_result, and 2 typed + blockers with full evidence/source-ref/quote coverage. +- Passing jobs preserve source-readback audit metadata and record zero source + mutations. + +Not allowed: + +- Do not claim ELF broadly beats OpenAI Pulse, ChatGPT Tasks, Claude Dreams, or + hosted managed-memory products from this local service-native slice. +- Do not claim private-corpus or provider-backed Dreaming readiness until XY-930 + operator-owned inputs exist. +- Do not treat stale/trap refs preserved in materialization audit metadata as used + benchmark evidence. + +## Next Optimization Direction + +The next useful lane is XY-930: run private-corpus, provider-backed, and hosted +scheduler gates only when operator-owned inputs exist. Until then, optimization +should focus on surfacing these derived artifacts in operator UI/review workflows +without converting private/provider blockers into claimed wins. diff --git a/docs/evidence/benchmarking/index.md b/docs/evidence/benchmarking/index.md index 47a31d1a..6421ddeb 100644 --- a/docs/evidence/benchmarking/index.md +++ b/docs/evidence/benchmarking/index.md @@ -39,3 +39,4 @@ Routes to: Benchmarking evidence concepts under `docs/evidence/benchmarking/`. - `2026-06-19-letta-core-archive-export-readback-report.md`: Letta Core/Archive Export-Readback Report - June 19, 2026; adds a Docker-contained Letta materialization/report command while preserving all six core/archive comparison scenarios as typed blockers until exported core block JSON, archival readback/search JSON, and source ids exist. 
- `2026-06-19-openviking-trajectory-materialization-report.md`: OpenViking Trajectory Materialization Report - June 19, 2026; materializes the context-trajectory fixture slice through a dedicated repo task while preserving staged retrieval, hierarchy selection, and recursive/context expansion as typed blockers. - `2026-06-19-qmd-debug-ergonomics-dreaming-retest-report.md`: qmd Debug-Ergonomics Dreaming Retest Report - June 19, 2026; confirms qmd's default top-k/replay edge is unchanged while ELF keeps the narrow operator-debug trace/stage visibility wins. +- `2026-06-19-service-native-dreaming-readback-report.md`: Service-Native Dreaming Readback Report - June 19, 2026; materializes memory summary, proactive brief, and scheduled-memory derived outputs through `ElfService` readback with 9 pass, 0 wrong_result, and 2 typed XY-930 blockers. diff --git a/docs/log.md b/docs/log.md index fa379d2a..b6f87575 100644 --- a/docs/log.md +++ b/docs/log.md @@ -45,3 +45,7 @@ logs. for XY-984, plus `cargo make smoke-letta-core-archive-export-readback`, preserving all six Letta comparison scenarios as typed blockers until exported core block JSON, archival readback/search JSON, and fixture source ids exist. +- Added the service-native Dreaming readback report and snapshots for XY-986, plus + `cargo make real-world-memory-service-native-dreaming`, proving public/local + memory summary, proactive brief, and scheduled-memory artifacts can be materialized + through `ElfService` readback while preserving XY-930 private/provider blockers. 
diff --git a/scripts/real-world-docker.sh b/scripts/real-world-docker.sh
index a6413839..ee7e9685 100755
--- a/scripts/real-world-docker.sh
+++ b/scripts/real-world-docker.sh
@@ -22,6 +22,12 @@ memory-live-consolidation)
     -e ELF_CONSOLIDATION_LIVE_FIXTURES \
     baseline-runner bash scripts/real-world-consolidation-live-adapter.sh
   ;;
+memory-service-native-dreaming)  # runs the service-native Dreaming readback script inside the baseline-runner container
+  docker compose -f docker-compose.baseline.yml run --build --rm \
+    -e ELF_DREAMING_SERVICE_NATIVE_REPORT_DIR \
+    -e ELF_DREAMING_SERVICE_NATIVE_FIXTURES \
+    baseline-runner bash scripts/real-world-dreaming-service-native.sh
+  ;;
 memory-live-adapters)
   lightrag_start="$(printenv ELF_LIGHTRAG_CONTEXT_START || true)"
   graphiti_start="$(printenv ELF_GRAPHITI_ZEP_SMOKE_START || true)"
diff --git a/scripts/real-world-dreaming-service-native.sh b/scripts/real-world-dreaming-service-native.sh
new file mode 100755
index 00000000..f6592d39
--- /dev/null
+++ b/scripts/real-world-dreaming-service-native.sh
@@ -0,0 +1,88 @@
+#!/usr/bin/env bash
+set -euo pipefail  # stop on command failure, unset variables, and failed pipeline stages
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"  # parent of the directory holding this script
+REPORT_DIR="${ELF_DREAMING_SERVICE_NATIVE_REPORT_DIR:-${ROOT_DIR}/tmp/real-world-memory/service-native-dreaming}"  # output directory; env override wins when non-empty
+FIXTURE_ROOT="${ELF_DREAMING_SERVICE_NATIVE_FIXTURES:-${ROOT_DIR}/apps/elf-eval/fixtures/real_world_memory}"  # fixture source; env override wins when non-empty
+INPUT_FIXTURE_DIR="${REPORT_DIR}/input-fixtures"  # staging copy of the three fixture suites handed to the adapter
+
+if [[ ! -f "/.dockerenv" && "${ELF_DREAMING_SERVICE_NATIVE_ALLOW_HOST:-0}" != "1" ]]; then  # no /.dockerenv marker and no explicit host opt-in
+  echo "Refusing to run service-native Dreaming readback outside Docker. Use cargo make real-world-memory-service-native-dreaming." >&2
+  exit 1
+fi
+
+for cmd in bash cargo jq; do  # every tool invoked below must be on PATH
+  if ! command -v "${cmd}" >/dev/null 2>&1; then
+    echo "Missing ${cmd} in service-native Dreaming readback runner." >&2
+    exit 1
+  fi
+done
+
+mkdir -p "${REPORT_DIR}"
+rm -rf "${INPUT_FIXTURE_DIR}" \
+  "${REPORT_DIR:?}/elf-fixtures" \
+  "${REPORT_DIR:?}/elf-materialization.json" \
+  "${REPORT_DIR:?}/report.json" \
+  "${REPORT_DIR:?}/report.md" \
+  "${REPORT_DIR:?}/summary.json"  # clears outputs of earlier runs; :? aborts if REPORT_DIR is empty or unset
+
+mkdir -p "${INPUT_FIXTURE_DIR}"
+cp -R "${FIXTURE_ROOT}/memory_summary" "${INPUT_FIXTURE_DIR}/memory_summary"  # only these three suites are staged
+cp -R "${FIXTURE_ROOT}/proactive_brief" "${INPUT_FIXTURE_DIR}/proactive_brief"
+cp -R "${FIXTURE_ROOT}/scheduled_memory" "${INPUT_FIXTURE_DIR}/scheduled_memory"
+
+cd "${ROOT_DIR}"  # the relative --config path below is resolved from here
+
+cargo run -p elf-eval --bin real_world_live_adapter -- elf \
+  --fixtures "${INPUT_FIXTURE_DIR}" \
+  --out-fixtures "${REPORT_DIR}/elf-fixtures" \
+  --evidence-out "${REPORT_DIR}/elf-materialization.json" \
+  --config config/local/elf.docker.toml \
+  --adapter-id elf_service_native_dreaming  # step 1: writes elf-fixtures/ and elf-materialization.json
+
+cargo run -p elf-eval --bin real_world_job_benchmark -- run \
+  --fixtures "${REPORT_DIR}/elf-fixtures" \
+  --out "${REPORT_DIR}/report.json" \
+  --run-id real-world-memory-service-native-dreaming \
+  --adapter-id elf_service_native_dreaming \
+  --adapter-name "ELF service-native Dreaming readback adapter" \
+  --adapter-behavior service_native_dreaming_readback \
+  --adapter-storage-status pass \
+  --adapter-runtime-status pass \
+  --adapter-notes "Materialized through ElfService add_note/list/search readback for memory_summary, proactive_brief, and scheduled_memory fixtures. Private/provider blockers remain typed non-pass records under XY-930."  # step 2: benchmarks the step-1 fixtures into report.json
+
+cargo run -p elf-eval --bin real_world_job_benchmark -- publish \
+  --report "${REPORT_DIR}/report.json" \
+  --out "${REPORT_DIR}/report.md"
+
+jq -n \
+  --slurpfile materialization "${REPORT_DIR}/elf-materialization.json" \
+  --slurpfile report "${REPORT_DIR}/report.json" \
+  '(env.ELF_DREAMING_SERVICE_NATIVE_REPORT_DIR // "tmp/real-world-memory/service-native-dreaming") as $artifact_dir | {  # one directory value feeds artifact_dir and both report paths
+    schema: "elf.service_native_dreaming_readback_sweep/v1",
+    generated_at: (now | todateiso8601),
+    fixture_dir: (env.ELF_DREAMING_SERVICE_NATIVE_FIXTURES // "apps/elf-eval/fixtures/real_world_memory"),
+    artifact_dir: $artifact_dir,
+    adapter: {
+      adapter_id: "elf_service_native_dreaming",
+      evidence_class: "service_native_readback",
+      materialization: $materialization[0],
+      report: {
+        json: ($artifact_dir + "/report.json"),  # follows a report-dir override instead of a fixed tmp/ path
+        markdown: ($artifact_dir + "/report.md"),  # same directory as report.json
+        summary: $report[0].summary,
+        suites: $report[0].suites
+      }
+    },
+    comparison_boundary: {
+      baseline: "XY-955 fixture-backed Dreaming outputs",
+      judgment_rule: "improved only when service-native readback scores source-linked artifacts without stale, tombstoned, unsupported, untraced, or source-mutation violations",
+      private_provider_boundary: "XY-930 remains blocked unless operator-owned manifest and explicit provider setup exist"
+    }
+  }' >"${REPORT_DIR}/summary.json"
+
+echo "Service-native Dreaming readback reports:"  # list every artifact written under REPORT_DIR
+echo "  ${REPORT_DIR}/elf-materialization.json"
+echo "  ${REPORT_DIR}/report.json"
+echo "  ${REPORT_DIR}/report.md"
+echo "  ${REPORT_DIR}/summary.json"