From 8821b61aaf3029f356a2a00cbd9c6af41d33f31b Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Fri, 19 Jun 2026 14:03:33 +0800 Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Materialize Letta core archive export benchmark","authority":"XY-984"} --- Makefile.toml | 9 + README.md | 8 + .../memory_projects_manifest.json | 124 +- ...a-core-archive-export-readback-report.json | 764 ++++++++++++ .../tests/real_world_job_benchmark.rs | 175 ++- docker-compose.baseline.yml | 14 + ...tta-core-archive-export-readback-report.md | 113 ++ docs/evidence/benchmarking/index.md | 1 + docs/log.md | 4 + ...etta-core-archive-export-readback-smoke.py | 1062 +++++++++++++++++ scripts/smoke-docker.sh | 25 + 11 files changed, 2234 insertions(+), 65 deletions(-) create mode 100644 apps/elf-eval/fixtures/report_snapshots/2026-06-19-letta-core-archive-export-readback-report.json create mode 100644 docs/evidence/benchmarking/2026-06-19-letta-core-archive-export-readback-report.md create mode 100755 scripts/letta-core-archive-export-readback-smoke.py diff --git a/Makefile.toml b/Makefile.toml index b9b93a83..4505b75e 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -1276,6 +1276,7 @@ args = [ # | smoke-graphify-docker-graph-report | command | | # | smoke-graphiti-zep-docker-temporal | command | | # | smoke-graphrag-docker | command | | +# | smoke-letta-core-archive-export-readback | command | | # | smoke-lightrag-docker-context | command | | # | smoke-ragflow-docker | command | | # | smoke-real-world-job | composite | | @@ -1306,6 +1307,14 @@ args = [ "graphrag-docker", ] +[tasks.smoke-letta-core-archive-export-readback] +workspace = false +command = "bash" +args = [ + "scripts/smoke-docker.sh", + "letta-core-archive-export-readback", +] + [tasks.smoke-lightrag-docker-context] workspace = false command = "bash" diff --git a/README.md b/README.md index 55bf95e2..0c5eb979 100644 --- a/README.md +++ b/README.md @@ -191,6 +191,14 @@ provider-backed ELF evidence was required. 
as 0 pass, 0 wrong_result, and 3 typed blockers with 9/9 evidence coverage. This improves auditability but does not remove the OpenViking context-trajectory gap or support any ELF win, tie, or loss claim on those strengths. +- Letta core/archive materialization after XY-984: the June 19 follow-up adds + `cargo make smoke-letta-core-archive-export-readback`, a Docker-contained + materialization/report command for the six `core_archival_memory` scenarios. The + default run scores 0 pass, 0 wrong_result, and 6 typed blockers with 14/14 evidence, + source-ref, and quote coverage. This improves the Letta audit path but keeps the + competitive status unchanged: no ELF-over-Letta win, tie, or loss is allowed until + exported Letta core block JSON, archival readback/search JSON, and fixture source ids + are present. - Full-suite live real-world adapter sweep after XY-926: ELF and qmd emit Docker-isolated `live_real_world` records for all 55 checked-in jobs across 13 suites through `cargo make real-world-memory-live-adapters`. Both keep the original diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index 00490fc1..109cb8d8 100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -2379,31 +2379,41 @@ "overall_status": "blocked", "setup": { "status": "blocked", - "evidence": "Letta is D1 reviewed as a core/archival memory reference. The contained comparison contract is a Docker-only benchmark-created agent export that must return core block JSON, archival search readback, and source ids before any scenario claim is scored." + "evidence": "Letta is D1 reviewed as a core/archival memory reference. 
The contained comparison contract now has cargo make smoke-letta-core-archive-export-readback, a Docker-only benchmark-created agent export/readback materializer that must return core block JSON, archival search/readback JSON, and source ids before any scenario claim is scored.", + "command": "cargo make smoke-letta-core-archive-export-readback", + "artifact": "tmp/real-world-memory/letta-core-archive/letta-core-archive-export.json" }, "run": { - "status": "not_encoded", - "evidence": "No Letta materializer currently creates the benchmark agent, imports the ELF core_archival_memory fixture corpus, or exports comparable core and archival evidence." + "status": "blocked", + "evidence": "The default materializer emits a typed blocked report unless a Docker-local Letta server and explicit model/provider configuration produce benchmark-owned core block export and archival readback/search output.", + "command": "ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 cargo make smoke-letta-core-archive-export-readback", + "artifact": "tmp/real-world-memory/letta-core-archive/summary.json" }, "result": { - "status": "not_encoded", - "evidence": "No Letta core block, archival fallback, stale-core, scope, provenance, or project-decision result is claimed." + "status": "blocked", + "evidence": "No Letta core block, archival fallback, stale-core, scope, provenance, or project-decision pass/win/tie/loss is claimed until the generated export/readback artifact maps required source ids.", + "artifact": "tmp/real-world-memory/letta-core-archive/report.json" }, "capabilities": [ { "capability": "core_archival_memory", "status": "blocked", - "evidence": "ELF fixture jobs now score core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search; Letta remains blocked until its export maps equivalent source ids." 
+ "evidence": "ELF fixture jobs score core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search; Letta remains blocked until its export maps equivalent source ids." }, { "capability": "docker_embedding_configuration", "status": "blocked", - "evidence": "Docker setup requires explicit embedding configuration before archival retrieval can be tested." + "evidence": "Official Docker setup requires explicit embedding configuration before archival retrieval can be tested." }, { "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "A Docker-contained materializer now exists and emits typed blocked evidence by default; live scoring still requires exported Letta core blocks, archival list/search JSON, and source-id mappings." + }, + { + "capability": "broad_letta_quality_claim", "status": "not_encoded", - "evidence": "No Letta materializer or scorer mapping exists." + "evidence": "The materializer does not score broad Letta product quality, hosted/private state, personalization breadth, or production durability." } ], "suites": [ @@ -2414,8 +2424,8 @@ }, { "suite_id": "project_decisions", - "status": "not_encoded", - "evidence": "Archival memory decision retrieval is not encoded for Letta." + "status": "blocked", + "evidence": "The project-decision recovery row is represented only through the core_archival_memory export/readback materializer and remains blocked without mapped source ids." }, { "suite_id": "work_resume", @@ -2425,36 +2435,39 @@ { "suite_id": "core_archival_memory", "status": "blocked", - "evidence": "ELF fixture coverage exists, but Letta has no contained export/readback artifact for the same core-vs-archival jobs." + "evidence": "A Docker-contained materializer now emits the core_archival_memory scenarios as typed blocked unless live Letta export/readback maps core block JSON, archival search/readback JSON, and source ids." 
} ], "scenarios": [ { "scenario_id": "core_block_attachment_readback", "suite_id": "core_archival_memory", - "status": "not_encoded", + "status": "blocked", "elf_position": "untested", - "comparison_outcome": "not_tested", - "evidence": "ELF fixture core-archival-core-block-attachment-001 scores exact core block attachment and keeps core readback out of Qdrant-backed archival search. Letta has no comparable exported core block attachment evidence.", - "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_attachment.json" + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-core-block-attachment-001 scores exact core block attachment and keeps core readback out of Qdrant-backed archival search. Letta remains blocked until the generated export/readback artifact maps this core block attachment source id.", + "artifact": "tmp/real-world-memory/letta-core-archive/summary.json", + "command": "cargo make smoke-letta-core-archive-export-readback" }, { "scenario_id": "core_block_scope_readback", "suite_id": "core_archival_memory", - "status": "not_encoded", + "status": "blocked", "elf_position": "untested", - "comparison_outcome": "not_tested", - "evidence": "ELF fixture core-archival-core-block-scope-001 scores read_profile, shared scope, and private-owner boundaries. Letta scope behavior remains unscored without a contained export of agent, block, and visibility metadata.", - "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_scope.json" + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-core-block-scope-001 scores read_profile, shared scope, and private-owner boundaries. 
Letta scope behavior remains blocked until the generated export includes agent, block, visibility metadata, and source ids.", + "artifact": "tmp/real-world-memory/letta-core-archive/summary.json", + "command": "cargo make smoke-letta-core-archive-export-readback" }, { "scenario_id": "core_block_provenance_readback", "suite_id": "core_archival_memory", - "status": "not_encoded", + "status": "blocked", "elf_position": "untested", - "comparison_outcome": "not_tested", - "evidence": "ELF fixture core-archival-core-block-provenance-001 scores source_ref and audit_history readback. Letta provenance remains not_tested until exported core memory includes stable source ids and audit-equivalent events.", - "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_provenance.json" + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-core-block-provenance-001 scores source_ref and audit_history readback. Letta provenance remains blocked until exported core memory includes stable source ids and audit-equivalent events.", + "artifact": "tmp/real-world-memory/letta-core-archive/summary.json", + "command": "cargo make smoke-letta-core-archive-export-readback" }, { "scenario_id": "stale_core_detection", @@ -2463,7 +2476,8 @@ "elf_position": "untested", "comparison_outcome": "blocked", "evidence": "ELF fixture core-archival-stale-core-detection-001 scores archival evidence superseding a stale core block. 
Letta stale-core comparison is blocked until core export and archival readback can be joined by source ids.", - "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/stale_core_detection.json" + "artifact": "tmp/real-world-memory/letta-core-archive/summary.json", + "command": "cargo make smoke-letta-core-archive-export-readback" }, { "scenario_id": "archival_fallback_readback", @@ -2472,52 +2486,74 @@ "elf_position": "untested", "comparison_outcome": "blocked", "evidence": "ELF fixture core-archival-archival-fallback-001 scores fallback from insufficient core memory to archival note search. Letta fallback comparison is blocked until archival search output can be exported with source ids.", - "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/archival_fallback.json" + "artifact": "tmp/real-world-memory/letta-core-archive/summary.json", + "command": "cargo make smoke-letta-core-archive-export-readback" }, { "scenario_id": "core_archival_project_decision_recovery", "suite_id": "core_archival_memory", - "status": "not_encoded", + "status": "blocked", "elf_position": "untested", - "comparison_outcome": "not_tested", - "evidence": "ELF fixture core-archival-project-decision-recovery-001 scores core routing plus archival decision rationale. Letta project-decision recovery remains not_tested until the contained export/readback contract exists.", - "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/project_decision_recovery.json" + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-project-decision-recovery-001 scores core routing plus archival decision rationale. 
Letta project-decision recovery remains blocked until the generated export/readback artifact maps core routing plus archival rationale source ids.", + "artifact": "tmp/real-world-memory/letta-core-archive/summary.json", + "command": "cargo make smoke-letta-core-archive-export-readback" } ], "evidence": [ + { + "kind": "artifact", + "ref": "tmp/real-world-memory/letta-core-archive/letta-core-archive-export.json", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/letta-core-archive/summary.json", + "status": "blocked" + }, + { + "kind": "source", + "ref": "https://docs.letta.com/guides/docker", + "status": "real" + }, { "kind": "source", - "ref": "https://github.com/letta-ai/letta", + "ref": "https://docs.letta.com/api/python", "status": "real" }, { "kind": "source", - "ref": "https://docs.letta.com/guides/docker/", + "ref": "https://docs.letta.com/api/resources/agents/subresources/passages/methods/search", "status": "real" } ], "execution_metadata": { "sources": [ { - "label": "Letta repository", - "url": "https://github.com/letta-ai/letta", - "evidence": "Official source for Letta stateful agents and memory." + "label": "Letta Docker docs", + "url": "https://docs.letta.com/guides/docker", + "evidence": "Official Docker setup and explicit embedding configuration boundary." }, { - "label": "Letta Docker docs", - "url": "https://docs.letta.com/guides/docker/", - "evidence": "Official Docker deployment guide and embedding configuration boundary." + "label": "Letta Python API", + "url": "https://docs.letta.com/api/python", + "evidence": "Official Python SDK memory block creation and retrieval examples." + }, + { + "label": "Letta archival search API", + "url": "https://docs.letta.com/api/resources/agents/subresources/passages/methods/search", + "evidence": "Official archival-memory search endpoint contract." 
} ], - "setup_path": "Use a Docker-only Letta server or CLI flow that creates a benchmark-owned agent, loads the checked-in core_archival_memory fixture corpus, writes core memory and archival memory with fixture source ids, then exports core block JSON plus archival search/readback JSON.", - "runtime_boundary": "Docker-only Letta server or CLI flow with benchmark-created agents, benchmark-owned storage, no host-global state, and no unstated hosted service dependency.", - "resource_expectation": "Embedding model, agent server state, exported core memory, archival search output, and provider boundaries must be explicit in the artifact.", + "setup_path": "Run cargo make smoke-letta-core-archive-export-readback for a typed artifact; set ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 with explicit model/provider configuration for a live export attempt. The smoke exports core block JSON plus archival search/readback JSON when Letta setup succeeds.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner plus optional Letta server profile, benchmark-created agent, benchmark-owned fixture corpus, no hosted/private state, and artifacts under tmp/real-world-memory/letta-core-archive.", + "resource_expectation": "Letta Docker server, Python SDK client, explicit model and embedding configuration, exported core memory, archival search output, and provider boundaries must be explicit in the artifact.", "retry_guidance": [ - "Create a tiny Docker agent with core memory and archival memory loaded from the ELF core_archival_memory fixtures.", - "Export core block readback, archival search results, source ids, and any audit-equivalent metadata as JSON before scoring.", - "Score core-versus-archival scenarios only after source evidence can be exported and mapped to the fixture evidence ids." 
+ "Default command records a typed blocked artifact without model calls.", + "Enable the live path only with Docker-local Letta and explicit provider or local model configuration.", + "Score core-versus-archival scenarios only after core block export and archival list/search output map to fixture evidence ids." ], - "research_depth": "D1 feasibility verdict: research_only (XY-882); XY-927 selects the contained export/readback contract, but the Letta adapter remains blocked until that artifact exists" + "research_depth": "D1 feasibility verdict: research_only (XY-882); XY-927 selected the contained export/readback contract; XY-984 adds the Docker-contained materializer and keeps the comparison blocked until live export evidence maps source ids." } }, { diff --git a/apps/elf-eval/fixtures/report_snapshots/2026-06-19-letta-core-archive-export-readback-report.json b/apps/elf-eval/fixtures/report_snapshots/2026-06-19-letta-core-archive-export-readback-report.json new file mode 100644 index 00000000..81a2c8c8 --- /dev/null +++ b/apps/elf-eval/fixtures/report_snapshots/2026-06-19-letta-core-archive-export-readback-report.json @@ -0,0 +1,764 @@ +{ + "adapter_id": "letta_core_archive_export_readback", + "evidence_class": "research_gate", + "generated_at": "2026-06-19T05:50:58Z", + "manifest": { + "json": "tmp/real-world-memory/letta-core-archive/memory_projects_manifest.letta-core-archive.json", + "status_source": "external_adapter_manifest_score_aligned", + "suites": [ + { + "evidence": "Only the six checked-in core_archival_memory scenarios are represented.", + "status": "blocked", + "suite_id": "core_archival_memory" + }, + { + "evidence": "Scoped preference behavior is outside this core/archive export smoke.", + "status": "not_encoded", + "suite_id": "personalization" + }, + { + "evidence": "Project-decision recovery is scored only through the core_archival_memory fixture that requires core routing plus archival rationale source ids.", + "status": "blocked", + 
"suite_id": "project_decisions" + }, + { + "evidence": "Agent resumption across sessions is not encoded by this export/readback smoke.", + "status": "not_encoded", + "suite_id": "work_resume" + } + ], + "summary": "blocked" + }, + "materialization": { + "adapter_id": "letta_core_archive_export_readback", + "artifacts": { + "generated_fixture_dir": "tmp/real-world-memory/letta-core-archive/letta-fixtures/core_archival_memory", + "live_output": null, + "manifest": "tmp/real-world-memory/letta-core-archive/memory_projects_manifest.letta-core-archive.json", + "materialization": "tmp/real-world-memory/letta-core-archive/letta-core-archive-export.json", + "scored_report_json": "tmp/real-world-memory/letta-core-archive/report.json", + "scored_report_markdown": "tmp/real-world-memory/letta-core-archive/report.md", + "summary": "tmp/real-world-memory/letta-core-archive/summary.json" + }, + "benchmark_input": { + "archival_passages": [ + { + "fixture_source": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/archival_fallback.json", + "job_id": "core-archival-archival-fallback-001", + "kind": "runbook", + "source_id": "fallback-archival-runbook", + "text": "Archival rollback note: restore the Postgres backup, rebuild Qdrant from Postgres chunk vectors, and verify search recovers the restored note." + }, + { + "fixture_source": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/project_decision_recovery.json", + "job_id": "core-archival-project-decision-recovery-001", + "kind": "decision", + "source_id": "decision-archival-outcome-policy", + "text": "Archival decision record: scenario outcomes use win, tie, loss, not_tested, blocked, or non_goal only when scenario evidence supports them." 
+ }, + { + "fixture_source": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/project_decision_recovery.json", + "job_id": "core-archival-project-decision-recovery-001", + "kind": "decision", + "source_id": "decision-archival-core-search-boundary", + "text": "Archival project decision: core blocks stay separate from archival note search and Qdrant-derived retrieval." + }, + { + "fixture_source": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/project_decision_recovery.json", + "job_id": "core-archival-project-decision-recovery-001", + "kind": "comparison_boundary", + "source_id": "decision-letta-export-boundary", + "text": "Letta comparison boundary: no contained export/readback artifact maps core block JSON, archival search/readback JSON, and source ids, so Letta remains blocked or not_tested and no win, tie, or loss claim is allowed." + }, + { + "fixture_source": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/stale_core_detection.json", + "job_id": "core-archival-stale-core-detection-001", + "kind": "decision", + "source_id": "archival-current-validation-gate", + "text": "Archival decision update: before pushing a refreshed PR head, run cargo make fmt, cargo make lint-fix, and cargo make check." + }, + { + "fixture_source": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/stale_core_detection.json", + "job_id": "core-archival-stale-core-detection-001", + "kind": "decision", + "source_id": "archival-supersedes-core-rationale", + "text": "Rationale: archival note evidence supersedes the attached core block until the core block is updated from source-of-truth state." 
+ } + ], + "core_blocks": [ + { + "fixture_source": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/archival_fallback.json", + "job_id": "core-archival-archival-fallback-001", + "kind": "core_block", + "label": "fallback-core-insufficient", + "source_id": "fallback-core-insufficient", + "text": "Core block summary: a rollback runbook exists for single-user production, but this core block intentionally omits the rollback steps.", + "value": "Source ID: fallback-core-insufficient\nCore block summary: a rollback runbook exists for single-user production, but this core block intentionally omits the rollback steps." + }, + { + "fixture_source": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_attachment.json", + "job_id": "core-archival-core-block-attachment-001", + "kind": "core_block", + "label": "core-attachment-active", + "source_id": "core-attachment-active", + "text": "Core block attachment: key project_style has an active attachment for tenant local-tenant project ELF agent local-agent read_profile private_plus_project.", + "value": "Source ID: core-attachment-active\nCore block attachment: key project_style has an active attachment for tenant local-tenant project ELF agent local-agent read_profile private_plus_project." + }, + { + "fixture_source": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_attachment.json", + "job_id": "core-archival-core-block-attachment-001", + "kind": "core_block_contract", + "label": "core-attachment-not-search", + "source_id": "core-attachment-not-search", + "text": "Core block readback is not archival search; it does not embed, rerank, search Qdrant, create a search session, or record note hits.", + "value": "Source ID: core-attachment-not-search\nCore block readback is not archival search; it does not embed, rerank, search Qdrant, create a search session, or record note hits." 
+ }, + { + "fixture_source": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_provenance.json", + "job_id": "core-archival-core-block-provenance-001", + "kind": "core_block", + "label": "core-provenance-source-ref", + "source_id": "core-provenance-source-ref", + "text": "Provenance evidence: core block release_policy returns source_ref schema source_ref/v1 with resolver real_world_job_fixture/v1 and locator quote retained for reviewer inspection.", + "value": "Source ID: core-provenance-source-ref\nProvenance evidence: core block release_policy returns source_ref schema source_ref/v1 with resolver real_world_job_fixture/v1 and locator quote retained for reviewer inspection." + }, + { + "fixture_source": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_provenance.json", + "job_id": "core-archival-core-block-provenance-001", + "kind": "core_block_event", + "label": "core-provenance-audit-events", + "source_id": "core-provenance-audit-events", + "text": "Audit evidence: release_policy has append-only events block_created, block_updated, and attachment_added returned in audit_history.", + "value": "Source ID: core-provenance-audit-events\nAudit evidence: release_policy has append-only events block_created, block_updated, and attachment_added returned in audit_history." 
+ }, + { + "fixture_source": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_scope.json", + "job_id": "core-archival-core-block-scope-001", + "kind": "core_block", + "label": "core-scope-project-shared-readable", + "source_id": "core-scope-project-shared-readable", + "text": "Scope evidence: project_shared block release_gate is readable for tenant local-tenant project ELF agent local-agent only when the active attachment and read_profile all_scopes allow project_shared.", + "value": "Source ID: core-scope-project-shared-readable\nScope evidence: project_shared block release_gate is readable for tenant local-tenant project ELF agent local-agent only when the active attachment and read_profile all_scopes allow project_shared." + }, + { + "fixture_source": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_scope.json", + "job_id": "core-archival-core-block-scope-001", + "kind": "core_block", + "label": "core-scope-private-owner", + "source_id": "core-scope-private-owner", + "text": "Private owner evidence: agent_private block agent_a_workflow belongs to agent-a and must not be returned to agent-b even if agent-b has a matching read_profile label.", + "value": "Source ID: core-scope-private-owner\nPrivate owner evidence: agent_private block agent_a_workflow belongs to agent-a and must not be returned to agent-b even if agent-b has a matching read_profile label." 
+ }, + { + "fixture_source": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/project_decision_recovery.json", + "job_id": "core-archival-project-decision-recovery-001", + "kind": "core_block", + "label": "decision-core-routing-block", + "source_id": "decision-core-routing-block", + "text": "Core decision routing block: keep the benchmark outcome policy always attached and route detailed rationale to archival notes.", + "value": "Source ID: decision-core-routing-block\nCore decision routing block: keep the benchmark outcome policy always attached and route detailed rationale to archival notes." + }, + { + "fixture_source": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/stale_core_detection.json", + "job_id": "core-archival-stale-core-detection-001", + "kind": "core_block", + "label": "stale-core-validation-gate", + "source_id": "stale-core-validation-gate", + "text": "Stale core block: the validation gate is cargo make lint and cargo make test.", + "value": "Source ID: stale-core-validation-gate\nStale core block: the validation gate is cargo make lint and cargo make test." + } + ], + "required_evidence_ids": [ + "fallback-core-insufficient", + "fallback-archival-runbook", + "core-attachment-active", + "core-attachment-not-search", + "core-provenance-source-ref", + "core-provenance-audit-events", + "core-scope-project-shared-readable", + "core-scope-private-owner", + "decision-core-routing-block", + "decision-archival-outcome-policy", + "decision-archival-core-search-boundary", + "decision-letta-export-boundary", + "archival-current-validation-gate", + "archival-supersedes-core-rationale" + ], + "source_id_count": 21 + }, + "claim_boundaries": { + "allowed": [ + "The Letta comparison now has a reproducible Docker-contained materialization/report command.", + "The current default report may preserve typed blockers when live Letta/provider setup cannot produce export/readback evidence." 
+ ], + "not_allowed": [ + "Do not claim ELF beats Letta on core-vs-archival memory from fixture-only ELF evidence.", + "Do not score Letta pass, win, tie, or loss unless exported core block JSON, archival readback/search JSON, and fixture source ids are present." + ] + }, + "commands": [], + "docker_boundary": { + "compose_file": "docker-compose.baseline.yml", + "docker_only": true, + "host_global_installs_required": false, + "host_global_letta_state_used": false, + "hosted_letta_state_used": false, + "runner": "scripts/letta-core-archive-export-readback-smoke.py", + "runner_service": "baseline-runner", + "service_profile": "letta" + }, + "evidence_class": "research_gate", + "evidence_mapping": { + "expected_evidence_ids": [ + "fallback-core-insufficient", + "fallback-archival-runbook", + "core-attachment-active", + "core-attachment-not-search", + "core-provenance-source-ref", + "core-provenance-audit-events", + "core-scope-project-shared-readable", + "core-scope-private-owner", + "decision-core-routing-block", + "decision-archival-outcome-policy", + "decision-archival-core-search-boundary", + "decision-letta-export-boundary", + "archival-current-validation-gate", + "archival-supersedes-core-rationale" + ], + "jobs": [ + { + "expected_evidence_ids": [ + "fallback-core-insufficient", + "fallback-archival-runbook" + ], + "job_id": "core-archival-archival-fallback-001", + "mapped_evidence_ids": [], + "missing_evidence_ids": [ + "fallback-core-insufficient", + "fallback-archival-runbook" + ], + "reason": "Letta live export/readback is disabled by default; run ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 cargo make smoke-letta-core-archive-export-readback with explicit Docker/provider configuration.", + "source_fixture": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/archival_fallback.json", + "status": "blocked" + }, + { + "expected_evidence_ids": [ + "core-attachment-active", + "core-attachment-not-search" + ], + "job_id": 
"core-archival-core-block-attachment-001", + "mapped_evidence_ids": [], + "missing_evidence_ids": [ + "core-attachment-active", + "core-attachment-not-search" + ], + "reason": "Letta live export/readback is disabled by default; run ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 cargo make smoke-letta-core-archive-export-readback with explicit Docker/provider configuration.", + "source_fixture": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_attachment.json", + "status": "blocked" + }, + { + "expected_evidence_ids": [ + "core-provenance-source-ref", + "core-provenance-audit-events" + ], + "job_id": "core-archival-core-block-provenance-001", + "mapped_evidence_ids": [], + "missing_evidence_ids": [ + "core-provenance-source-ref", + "core-provenance-audit-events" + ], + "reason": "Letta live export/readback is disabled by default; run ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 cargo make smoke-letta-core-archive-export-readback with explicit Docker/provider configuration.", + "source_fixture": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_provenance.json", + "status": "blocked" + }, + { + "expected_evidence_ids": [ + "core-scope-project-shared-readable", + "core-scope-private-owner" + ], + "job_id": "core-archival-core-block-scope-001", + "mapped_evidence_ids": [], + "missing_evidence_ids": [ + "core-scope-project-shared-readable", + "core-scope-private-owner" + ], + "reason": "Letta live export/readback is disabled by default; run ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 cargo make smoke-letta-core-archive-export-readback with explicit Docker/provider configuration.", + "source_fixture": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_scope.json", + "status": "blocked" + }, + { + "expected_evidence_ids": [ + "decision-core-routing-block", + "decision-archival-outcome-policy", + "decision-archival-core-search-boundary", + "decision-letta-export-boundary" + ], + "job_id": 
"core-archival-project-decision-recovery-001", + "mapped_evidence_ids": [], + "missing_evidence_ids": [ + "decision-core-routing-block", + "decision-archival-outcome-policy", + "decision-archival-core-search-boundary", + "decision-letta-export-boundary" + ], + "reason": "Letta live export/readback is disabled by default; run ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 cargo make smoke-letta-core-archive-export-readback with explicit Docker/provider configuration.", + "source_fixture": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/project_decision_recovery.json", + "status": "blocked" + }, + { + "expected_evidence_ids": [ + "archival-current-validation-gate", + "archival-supersedes-core-rationale" + ], + "job_id": "core-archival-stale-core-detection-001", + "mapped_evidence_ids": [], + "missing_evidence_ids": [ + "archival-current-validation-gate", + "archival-supersedes-core-rationale" + ], + "reason": "Letta live export/readback is disabled by default; run ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 cargo make smoke-letta-core-archive-export-readback with explicit Docker/provider configuration.", + "source_fixture": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/stale_core_detection.json", + "status": "blocked" + } + ], + "mapped_evidence_ids": [], + "missing_evidence_ids": [ + "fallback-core-insufficient", + "fallback-archival-runbook", + "core-attachment-active", + "core-attachment-not-search", + "core-provenance-source-ref", + "core-provenance-audit-events", + "core-scope-project-shared-readable", + "core-scope-private-owner", + "decision-core-routing-block", + "decision-archival-outcome-policy", + "decision-archival-core-search-boundary", + "decision-letta-export-boundary", + "archival-current-validation-gate", + "archival-supersedes-core-rationale" + ], + "reason": "Letta live export/readback is disabled by default; run ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 cargo make smoke-letta-core-archive-export-readback with 
explicit Docker/provider configuration.", + "status": "blocked" + }, + "generated_at": "2026-06-19T05:50:58Z", + "improvement_regression_readback": { + "baseline": "XY-955 left Letta core/archive comparison blocked because no contained export/readback artifact existed.", + "current": "unchanged: the benchmark now has a Docker-contained materialization command and typed report, but the default run still preserves Letta comparison as blocked until live export/search data maps source ids.", + "judgment": "unchanged" + }, + "letta_export": { + "agent": null, + "archival_readback_json": null, + "archival_search_json": [], + "core_block_json": [], + "status": "blocked" + }, + "project": "Letta", + "provider_configuration": { + "base_url": "http://letta:8283", + "client_package": "pypi:letta-client", + "embedding": "openai/text-embedding-3-small", + "live_run_enabled": false, + "model": "openai/gpt-4o-mini", + "operator_owned_provider_credentials_used": false + }, + "resource_bounds": { + "archival_passage_count": 6, + "core_block_count": 9, + "elapsed_ms": 10933.304, + "source_fixture_count": 6, + "timeout_seconds": 600 + }, + "run_id": "letta-core-archive-20260619055047", + "schema": "elf.letta_core_archive_export_readback/v1", + "scored_benchmark": { + "counts": { + "blocked": 6, + "incomplete": 0, + "lifecycle_fail": 0, + "not_encoded": 0, + "pass": 0, + "wrong_result": 0 + }, + "evidence_coverage": 1.0, + "job_count": 6, + "mean_score": 0.0, + "schema": "elf.scored_benchmark_status/v1", + "source": "real_world_job_benchmark", + "status": "blocked" + }, + "status": { + "failure_class": "letta_live_run_disabled", + "failure_reason": "Letta live export/readback is disabled by default; run ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 cargo make smoke-letta-core-archive-export-readback with explicit Docker/provider configuration.", + "overall": "blocked", + "result": "blocked", + "run": "not_encoded", + "setup": "blocked", + "source": "smoke_materialization" + } + }, + 
"report": { + "json": "tmp/real-world-memory/letta-core-archive/report.json", + "markdown": "tmp/real-world-memory/letta-core-archive/report.md", + "suites": [ + { + "conflict_detection_count": 0, + "encoded_job_count": 0, + "expected_evidence_recall": null, + "history_readback_encoded_count": 0, + "irrelevant_context_ratio": null, + "reason": "No checked-in real_world_job fixture is encoded for this suite.", + "score_mean": null, + "stale_answer_count": 0, + "status": "not_encoded", + "suite_id": "trust_source_of_truth", + "temporal_validity_not_encoded_count": 0, + "trace_explainability_count": 0, + "unsupported_claim_count": 0, + "update_rationale_available_count": 0, + "wrong_result_count": 0 + }, + { + "conflict_detection_count": 0, + "encoded_job_count": 0, + "expected_evidence_recall": null, + "history_readback_encoded_count": 0, + "irrelevant_context_ratio": null, + "reason": "No checked-in real_world_job fixture is encoded for this suite.", + "score_mean": null, + "stale_answer_count": 0, + "status": "not_encoded", + "suite_id": "work_resume", + "temporal_validity_not_encoded_count": 0, + "trace_explainability_count": 0, + "unsupported_claim_count": 0, + "update_rationale_available_count": 0, + "wrong_result_count": 0 + }, + { + "conflict_detection_count": 0, + "encoded_job_count": 0, + "expected_evidence_recall": null, + "history_readback_encoded_count": 0, + "irrelevant_context_ratio": null, + "reason": "No checked-in real_world_job fixture is encoded for this suite.", + "score_mean": null, + "stale_answer_count": 0, + "status": "not_encoded", + "suite_id": "project_decisions", + "temporal_validity_not_encoded_count": 0, + "trace_explainability_count": 0, + "unsupported_claim_count": 0, + "update_rationale_available_count": 0, + "wrong_result_count": 0 + }, + { + "conflict_detection_count": 0, + "encoded_job_count": 0, + "expected_evidence_recall": null, + "history_readback_encoded_count": 0, + "irrelevant_context_ratio": null, + "reason": "No checked-in 
real_world_job fixture is encoded for this suite.", + "score_mean": null, + "stale_answer_count": 0, + "status": "not_encoded", + "suite_id": "retrieval", + "temporal_validity_not_encoded_count": 0, + "trace_explainability_count": 0, + "unsupported_claim_count": 0, + "update_rationale_available_count": 0, + "wrong_result_count": 0 + }, + { + "conflict_detection_count": 0, + "encoded_job_count": 0, + "expected_evidence_recall": null, + "history_readback_encoded_count": 0, + "irrelevant_context_ratio": null, + "reason": "No checked-in real_world_job fixture is encoded for this suite.", + "score_mean": null, + "stale_answer_count": 0, + "status": "not_encoded", + "suite_id": "memory_evolution", + "temporal_validity_not_encoded_count": 0, + "trace_explainability_count": 0, + "unsupported_claim_count": 0, + "update_rationale_available_count": 0, + "wrong_result_count": 0 + }, + { + "conflict_detection_count": 0, + "encoded_job_count": 0, + "expected_evidence_recall": null, + "history_readback_encoded_count": 0, + "irrelevant_context_ratio": null, + "reason": "No checked-in real_world_job fixture is encoded for this suite.", + "score_mean": null, + "stale_answer_count": 0, + "status": "not_encoded", + "suite_id": "consolidation", + "temporal_validity_not_encoded_count": 0, + "trace_explainability_count": 0, + "unsupported_claim_count": 0, + "update_rationale_available_count": 0, + "wrong_result_count": 0 + }, + { + "conflict_detection_count": 0, + "encoded_job_count": 0, + "expected_evidence_recall": null, + "history_readback_encoded_count": 0, + "irrelevant_context_ratio": null, + "reason": "No checked-in real_world_job fixture is encoded for this suite.", + "score_mean": null, + "stale_answer_count": 0, + "status": "not_encoded", + "suite_id": "memory_summary", + "temporal_validity_not_encoded_count": 0, + "trace_explainability_count": 0, + "unsupported_claim_count": 0, + "update_rationale_available_count": 0, + "wrong_result_count": 0 + }, + { + 
"conflict_detection_count": 0, + "encoded_job_count": 0, + "expected_evidence_recall": null, + "history_readback_encoded_count": 0, + "irrelevant_context_ratio": null, + "reason": "No checked-in real_world_job fixture is encoded for this suite.", + "score_mean": null, + "stale_answer_count": 0, + "status": "not_encoded", + "suite_id": "proactive_brief", + "temporal_validity_not_encoded_count": 0, + "trace_explainability_count": 0, + "unsupported_claim_count": 0, + "update_rationale_available_count": 0, + "wrong_result_count": 0 + }, + { + "conflict_detection_count": 0, + "encoded_job_count": 0, + "expected_evidence_recall": null, + "history_readback_encoded_count": 0, + "irrelevant_context_ratio": null, + "reason": "No checked-in real_world_job fixture is encoded for this suite.", + "score_mean": null, + "stale_answer_count": 0, + "status": "not_encoded", + "suite_id": "scheduled_memory", + "temporal_validity_not_encoded_count": 0, + "trace_explainability_count": 0, + "unsupported_claim_count": 0, + "update_rationale_available_count": 0, + "wrong_result_count": 0 + }, + { + "conflict_detection_count": 0, + "encoded_job_count": 0, + "expected_evidence_recall": null, + "history_readback_encoded_count": 0, + "irrelevant_context_ratio": null, + "reason": "No checked-in real_world_job fixture is encoded for this suite.", + "score_mean": null, + "stale_answer_count": 0, + "status": "not_encoded", + "suite_id": "knowledge_compilation", + "temporal_validity_not_encoded_count": 0, + "trace_explainability_count": 0, + "unsupported_claim_count": 0, + "update_rationale_available_count": 0, + "wrong_result_count": 0 + }, + { + "conflict_detection_count": 0, + "encoded_job_count": 0, + "expected_evidence_recall": null, + "history_readback_encoded_count": 0, + "irrelevant_context_ratio": null, + "reason": "No checked-in real_world_job fixture is encoded for this suite.", + "score_mean": null, + "stale_answer_count": 0, + "status": "not_encoded", + "suite_id": 
"operator_debugging_ux", + "temporal_validity_not_encoded_count": 0, + "trace_explainability_count": 0, + "unsupported_claim_count": 0, + "update_rationale_available_count": 0, + "wrong_result_count": 0 + }, + { + "conflict_detection_count": 0, + "encoded_job_count": 0, + "expected_evidence_recall": null, + "history_readback_encoded_count": 0, + "irrelevant_context_ratio": null, + "reason": "No checked-in real_world_job fixture is encoded for this suite.", + "score_mean": null, + "stale_answer_count": 0, + "status": "not_encoded", + "suite_id": "capture_integration", + "temporal_validity_not_encoded_count": 0, + "trace_explainability_count": 0, + "unsupported_claim_count": 0, + "update_rationale_available_count": 0, + "wrong_result_count": 0 + }, + { + "conflict_detection_count": 0, + "encoded_job_count": 0, + "expected_evidence_recall": null, + "history_readback_encoded_count": 0, + "irrelevant_context_ratio": null, + "reason": "No checked-in real_world_job fixture is encoded for this suite.", + "score_mean": null, + "stale_answer_count": 0, + "status": "not_encoded", + "suite_id": "production_ops", + "temporal_validity_not_encoded_count": 0, + "trace_explainability_count": 0, + "unsupported_claim_count": 0, + "update_rationale_available_count": 0, + "wrong_result_count": 0 + }, + { + "conflict_detection_count": 0, + "encoded_job_count": 0, + "expected_evidence_recall": null, + "history_readback_encoded_count": 0, + "irrelevant_context_ratio": null, + "reason": "No checked-in real_world_job fixture is encoded for this suite.", + "score_mean": null, + "stale_answer_count": 0, + "status": "not_encoded", + "suite_id": "personalization", + "temporal_validity_not_encoded_count": 0, + "trace_explainability_count": 0, + "unsupported_claim_count": 0, + "update_rationale_available_count": 0, + "wrong_result_count": 0 + }, + { + "conflict_detection_count": 0, + "encoded_job_count": 6, + "expected_evidence_recall": 1.0, + "history_readback_encoded_count": 0, + 
"irrelevant_context_ratio": 0.0, + "reason": "At least one encoded job is blocked.", + "score_mean": 0.0, + "stale_answer_count": 0, + "status": "blocked", + "suite_id": "core_archival_memory", + "temporal_validity_not_encoded_count": 0, + "trace_explainability_count": 0, + "unsupported_claim_count": 0, + "update_rationale_available_count": 0, + "wrong_result_count": 0 + }, + { + "conflict_detection_count": 0, + "encoded_job_count": 0, + "expected_evidence_recall": null, + "history_readback_encoded_count": 0, + "irrelevant_context_ratio": null, + "reason": "No checked-in real_world_job fixture is encoded for this suite.", + "score_mean": null, + "stale_answer_count": 0, + "status": "not_encoded", + "suite_id": "context_trajectory", + "temporal_validity_not_encoded_count": 0, + "trace_explainability_count": 0, + "unsupported_claim_count": 0, + "update_rationale_available_count": 0, + "wrong_result_count": 0 + } + ], + "summary": { + "blocked": 6, + "conflict_detection_count": 0, + "consolidation": { + "executable_gap_count": 0, + "lineage_completeness": null, + "proposal_count": 0, + "proposal_unsupported_claim_count": 0, + "proposal_usefulness": null, + "review_action_correctness": null, + "source_mutation_count": 0 + }, + "encoded_suite_count": 1, + "evidence_coverage": 1.0, + "evidence_covered_count": 14, + "evidence_required_count": 14, + "expected_evidence_matched": 14, + "expected_evidence_recall": 1.0, + "expected_evidence_total": 14, + "history_readback_encoded_count": 0, + "incomplete": 0, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "job_count": 6, + "lifecycle_fail": 0, + "mean_latency_ms": 0.0, + "mean_score": 0.0, + "not_encoded": 0, + "operator_debug_job_count": 0, + "operator_ux_gap_count": 0, + "pass": 0, + "qdrant_rebuild_case_count": 0, + "qdrant_rebuild_pass_count": 0, + "quote_coverage": 1.0, + "quote_covered_count": 14, + "quote_required_count": 14, + "raw_sql_needed_count": 0, + "redaction_leak_count": 0, + 
"scope_check_count": 1, + "scope_correct_count": 1, + "scope_correctness": 1.0, + "scope_violation_count": 0, + "source_ref_coverage": 1.0, + "source_ref_covered_count": 14, + "source_ref_required_count": 14, + "stale_answer_count": 0, + "stale_retrieval_count": 0, + "temporal_validity_not_encoded_count": 0, + "total_cost": { + "amount": 0.0, + "currency": "USD", + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability_count": 0, + "trace_incomplete_count": 0, + "unsupported_claim": 0, + "unsupported_claim_count": 0, + "update_rationale_available_count": 0, + "wrong_result": 0, + "wrong_result_count": 0, + "wrong_result_stage_attribution_count": 0 + } + }, + "schema": "elf.letta_core_archive_export_readback_summary/v1", + "scored_benchmark": { + "counts": { + "blocked": 6, + "incomplete": 0, + "lifecycle_fail": 0, + "not_encoded": 0, + "pass": 0, + "wrong_result": 0 + }, + "evidence_coverage": 1.0, + "job_count": 6, + "mean_score": 0.0, + "schema": "elf.scored_benchmark_status/v1", + "source": "real_world_job_benchmark", + "status": "blocked" + }, + "status_boundary": { + "manifest": "external adapter declaration consumed by the scorer", + "materialization": "setup/run/evidence-mapping state emitted by the smoke runner", + "scored_benchmark": "post-score real_world_job outcome; use this for quality status" + } +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 982aa8a7..03c23feb 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -234,6 +234,10 @@ fn openviking_trajectory_materialization_report_json_path() -> Result { report_snapshot_path("2026-06-19-openviking-trajectory-materialization-report.json") } +fn letta_core_archive_export_readback_report_json_path() -> Result { + report_snapshot_path("2026-06-19-letta-core-archive-export-readback-report.json") +} + fn 
openviking_trajectory_materialization_report_markdown_path() -> Result { Ok(workspace_root()? .join("docs") @@ -242,6 +246,14 @@ fn openviking_trajectory_materialization_report_markdown_path() -> Result Result { + Ok(workspace_root()? + .join("docs") + .join("evidence") + .join("benchmarking") + .join("2026-06-19-letta-core-archive-export-readback-report.md")) +} + fn live_temporal_reconciliation_report_json_path() -> Result { report_snapshot_path("2026-06-16-live-temporal-reconciliation-report.json") } @@ -602,7 +614,7 @@ fn external_adapter_run_summarizes_nonzero_scenario_losses() -> Result<()> { report .pointer("/external_adapters/summary/scenario_outcome_counts/not_tested") .and_then(Value::as_u64), - Some(16) + Some(12) ); let adapters = array_at(&report, "/external_adapters/adapters")?; @@ -721,7 +733,7 @@ fn assert_external_adapter_manifest_status_summary(report: &Value) { report .pointer("/external_adapters/summary/suite_status_counts/blocked") .and_then(Value::as_u64), - Some(23) + Some(24) ); assert_eq!( report @@ -739,7 +751,7 @@ fn assert_external_adapter_manifest_status_summary(report: &Value) { report .pointer("/external_adapters/summary/suite_status_counts/not_encoded") .and_then(Value::as_u64), - Some(38) + Some(37) ); } @@ -766,7 +778,7 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_status_counts/blocked") .and_then(Value::as_u64), - Some(12) + Some(16) ); assert_eq!( report @@ -796,7 +808,7 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_status_counts/not_encoded") .and_then(Value::as_u64), - Some(11) + Some(7) ); assert_eq!( report @@ -844,13 +856,13 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_outcome_counts/not_tested") .and_then(Value::as_u64), - Some(17) + Some(13) ); assert_eq!( report 
.pointer("/external_adapters/summary/scenario_outcome_counts/blocked") .and_then(Value::as_u64), - Some(13) + Some(17) ); assert_eq!( report @@ -994,16 +1006,33 @@ fn assert_letta_core_archival_gate(adapter: &Value) -> Result<()> { adapter .pointer("/setup/evidence") .and_then(Value::as_str) - .is_some_and(|evidence| evidence.contains("Docker-only benchmark-created agent export")) + .is_some_and(|evidence| evidence.contains("smoke-letta-core-archive-export-readback") + && evidence.contains("Docker-only benchmark-created agent export/readback")) + ); + assert_eq!( + adapter.pointer("/setup/command").and_then(Value::as_str), + Some("cargo make smoke-letta-core-archive-export-readback") + ); + assert_eq!( + adapter.pointer("/run/command").and_then(Value::as_str), + Some( + "ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 cargo make smoke-letta-core-archive-export-readback" + ) ); assert!(adapter.pointer("/execution_metadata/setup_path").and_then(Value::as_str).is_some_and( |setup| setup.contains("exports core block JSON plus archival search/readback JSON") + && setup.contains("typed artifact") )); let suites = array_at(adapter, "/suites")?; let core_suite = find_by_field(suites, "/suite_id", "core_archival_memory")?; assert_eq!(core_suite.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + adapter.pointer("/capabilities/2/capability").and_then(Value::as_str), + Some("real_world_job_adapter") + ); + assert_eq!(adapter.pointer("/capabilities/2/status").and_then(Value::as_str), Some("blocked")); let scenarios = array_at(adapter, "/scenarios")?; let attachment = find_by_field(scenarios, "/scenario_id", "core_block_attachment_readback")?; @@ -1017,21 +1046,23 @@ fn assert_letta_core_archival_gate(adapter: &Value) -> Result<()> { assert_eq!(scenarios.len(), 6); for scenario in [attachment, scope, provenance, stale, fallback, decision] { + assert_eq!(scenario.pointer("/status").and_then(Value::as_str), Some("blocked")); 
assert_eq!(scenario.pointer("/elf_position").and_then(Value::as_str), Some("untested")); - assert!( - ["not_tested", "blocked"].contains( - &scenario - .pointer("/comparison_outcome") - .and_then(Value::as_str) - .ok_or_else(|| eyre::eyre!("missing Letta comparison_outcome"))? - ) + assert_eq!( + scenario.pointer("/comparison_outcome").and_then(Value::as_str), + Some("blocked") + ); + assert_eq!( + scenario.pointer("/command").and_then(Value::as_str), + Some("cargo make smoke-letta-core-archive-export-readback") + ); + assert_eq!( + scenario.pointer("/artifact").and_then(Value::as_str), + Some("tmp/real-world-memory/letta-core-archive/summary.json") ); } - assert_eq!( - attachment.pointer("/comparison_outcome").and_then(Value::as_str), - Some("not_tested") - ); + assert_eq!(attachment.pointer("/comparison_outcome").and_then(Value::as_str), Some("blocked")); assert_eq!(stale.pointer("/comparison_outcome").and_then(Value::as_str), Some("blocked")); assert_eq!(fallback.pointer("/comparison_outcome").and_then(Value::as_str), Some("blocked")); @@ -1817,7 +1848,11 @@ fn operator_debug_live_adapter_task_is_docker_scoped() -> Result<()> { fn external_adapter_manifest_rejects_unmeasured_win_loss_scenario_outcomes() -> Result<()> { let output = run_external_manifest_with_letta_attachment_mutation( "invalid-scenario-outcome-test", - |scenario| set_json_pointer(scenario, "/comparison_outcome", serde_json::json!("win")), + |scenario| { + set_json_pointer(scenario, "/status", serde_json::json!("not_encoded"))?; + + set_json_pointer(scenario, "/comparison_outcome", serde_json::json!("win")) + }, )?; assert!(!output.status.success(), "invalid scenario outcome unexpectedly passed"); @@ -3097,6 +3132,104 @@ fn openviking_trajectory_materialization_report_preserves_blocked_gates() -> Res Ok(()) } +#[test] +fn letta_core_archive_export_readback_report_preserves_blocked_gates() -> Result<()> { + let report = serde_json::from_str::(&fs::read_to_string( + 
letta_core_archive_export_readback_report_json_path()?, + )?)?; + let markdown = fs::read_to_string(letta_core_archive_export_readback_report_markdown_path()?)?; + let benchmarking_index = fs::read_to_string(benchmarking_index_path()?)?; + let readme = fs::read_to_string(readme_path()?)?; + + assert_eq!( + report.pointer("/schema").and_then(Value::as_str), + Some("elf.letta_core_archive_export_readback_summary/v1") + ); + assert_eq!( + report.pointer("/adapter_id").and_then(Value::as_str), + Some("letta_core_archive_export_readback") + ); + assert_eq!( + report.pointer("/materialization/status/failure_class").and_then(Value::as_str), + Some("letta_live_run_disabled") + ); + assert_eq!( + report.pointer("/materialization/status/overall").and_then(Value::as_str), + Some("blocked") + ); + assert_eq!( + report.pointer("/materialization/scored_benchmark/status").and_then(Value::as_str), + Some("blocked") + ); + assert_eq!( + report.pointer("/materialization/scored_benchmark/counts/blocked").and_then(Value::as_u64), + Some(6) + ); + assert_eq!( + report.pointer("/materialization/scored_benchmark/counts/pass").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/materialization/scored_benchmark/counts/wrong_result") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/materialization/scored_benchmark/evidence_coverage") + .and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report + .pointer("/materialization/benchmark_input/core_blocks") + .and_then(Value::as_array) + .map(Vec::len), + Some(9) + ); + assert_eq!( + report + .pointer("/materialization/benchmark_input/archival_passages") + .and_then(Value::as_array) + .map(Vec::len), + Some(6) + ); + assert_eq!( + report + .pointer("/materialization/evidence_mapping/expected_evidence_ids") + .and_then(Value::as_array) + .map(Vec::len), + Some(14) + ); + assert_eq!( + report + .pointer("/materialization/evidence_mapping/mapped_evidence_ids") + .and_then(Value::as_array) 
+ .map(Vec::len), + Some(0) + ); + assert_eq!( + report + .pointer("/materialization/improvement_regression_readback/judgment") + .and_then(Value::as_str), + Some("unchanged") + ); + assert!(array_contains_str( + &report, + "/materialization/claim_boundaries/not_allowed", + "Do not claim ELF beats Letta on core-vs-archival memory from fixture-only ELF evidence." + )?); + assert!(markdown.contains("The Letta follow-up is now reproducible")); + assert!(markdown.contains("6 typed blocked")); + assert!(markdown.contains("competitive status is unchanged")); + assert!(benchmarking_index.contains("2026-06-19-letta-core-archive-export-readback-report.md")); + assert!(readme.contains("Letta core/archive materialization after XY-984")); + assert!(readme.contains("smoke-letta-core-archive-export-readback")); + + Ok(()) +} + fn assert_openviking_trajectory_materialization_summary(report: &Value) -> Result<()> { assert_eq!( report.pointer("/schema").and_then(Value::as_str), @@ -4381,7 +4514,7 @@ fn generated_json_report_renders_markdown() -> Result<()> { assert!(markdown.contains("### Adapter Scenario Judgments")); assert!(markdown.contains("ELF scenario positions: `wins=10, ties=11, loses=1, untested=35`")); assert!(markdown.contains( - "Scenario comparison outcomes: `win=10, tie=11, loss=1, not_tested=17, blocked=13, non_goal=5`" + "Scenario comparison outcomes: `win=10, tie=11, loss=1, not_tested=13, blocked=17, non_goal=5`" )); assert!(markdown.contains("| `claude_mem_live_baseline` | `same_corpus_retrieval`")); assert!(markdown.contains("| `memsearch_live_baseline` | `ttl_expiry_lifecycle`")); diff --git a/docker-compose.baseline.yml b/docker-compose.baseline.yml index 5dc3180e..30d26a89 100644 --- a/docker-compose.baseline.yml +++ b/docker-compose.baseline.yml @@ -79,6 +79,19 @@ services: volumes: - elf-live-baseline-graphiti-falkordb:/data + letta: + profiles: + - letta + image: ${ELF_LETTA_IMAGE:-letta/letta:latest} + environment: + OPENAI_API_BASE: 
${ELF_LETTA_OPENAI_API_BASE:-} + OPENAI_API_KEY: ${ELF_LETTA_OPENAI_API_KEY:-} + ANTHROPIC_API_KEY: ${ELF_LETTA_ANTHROPIC_API_KEY:-} + OLLAMA_BASE_URL: ${ELF_LETTA_OLLAMA_BASE_URL:-} + LETTA_MEMFS_SERVICE_URL: ${ELF_LETTA_MEMFS_SERVICE_URL:-} + volumes: + - elf-live-baseline-letta-pgdata:/var/lib/postgresql/data + baseline-runner: build: context: . @@ -160,6 +173,7 @@ volumes: elf-live-baseline-cargo-registry: elf-live-baseline-graphiti-falkordb: elf-live-baseline-huggingface-cache: + elf-live-baseline-letta-pgdata: elf-live-baseline-lightrag-inputs: elf-live-baseline-lightrag-prompts: elf-live-baseline-lightrag-rag-storage: diff --git a/docs/evidence/benchmarking/2026-06-19-letta-core-archive-export-readback-report.md b/docs/evidence/benchmarking/2026-06-19-letta-core-archive-export-readback-report.md new file mode 100644 index 00000000..4ff09f82 --- /dev/null +++ b/docs/evidence/benchmarking/2026-06-19-letta-core-archive-export-readback-report.md @@ -0,0 +1,113 @@ +--- +type: Evidence +title: "Letta Core/Archive Export-Readback Report - June 19, 2026" +description: "Checked-in benchmark evidence record: Letta Core/Archive Export-Readback Report - June 19, 2026." +resource: docs/evidence/benchmarking/2026-06-19-letta-core-archive-export-readback-report.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-19 +tags: + - docs + - evidence + - benchmarking +--- +# Letta Core/Archive Export-Readback Report - June 19, 2026 + +Goal: Close the XY-984 materialization gap by adding a Docker-contained Letta +core/archive export-readback benchmark surface without changing ELF product +behavior or claiming ELF-over-Letta superiority. +Read this when: You need to know whether the Letta core-vs-archival comparison +blocker from the Dreaming competitor-strength retest was removed. 
+Inputs: +`apps/elf-eval/fixtures/report_snapshots/2026-06-19-letta-core-archive-export-readback-report.json`, +`apps/elf-eval/fixtures/real_world_memory/core_archival_memory/`, +`docs/evidence/benchmarking/2026-06-17-dreaming-competitor-strength-retest-report.md`, +and `docs/evidence/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md`. +Outputs: A Docker-contained materialization command, a generated Letta export/readback +artifact contract, and a scored typed-blocked report over the six core/archive jobs. + +## Executive Judgment + +The Letta follow-up is now reproducible as a benchmark materialization command, but +the competitive status is unchanged. + +`cargo make smoke-letta-core-archive-export-readback` runs inside the baseline +Docker runner and publishes: + +- 6 core/archive jobs. +- 0 pass. +- 0 wrong_result. +- 6 typed blocked. +- 14/14 evidence coverage. +- 14/14 source-ref coverage. +- 14/14 quote coverage. + +This improves the audit trail relative to XY-955 because the Letta comparison now +has an executable materialization/report path. It does not remove the Letta blocker: +the default run intentionally does not start a live Letta server or use provider +credentials, so it records `letta_live_run_disabled` and preserves the comparison as +blocked until exported Letta core block JSON, archival readback/search JSON, and +fixture source ids exist. + +## What Changed + +- Added `cargo make smoke-letta-core-archive-export-readback`. +- Added `scripts/letta-core-archive-export-readback-smoke.py`. +- Added an optional `letta` Docker Compose profile. +- Updated the external adapter manifest so Letta is no longer recorded as lacking a + materializer; it is recorded as materialized but still blocked by missing live + export/readback source-id evidence. +- Checked in the JSON companion at + `apps/elf-eval/fixtures/report_snapshots/2026-06-19-letta-core-archive-export-readback-report.json`. 
+ +## Scenario Status + +| Scenario | Current Status | Judgment | +| --- | --- | --- | +| Core block attachment readback | `blocked` | Materialized typed blocker; no Letta pass/tie/loss claim. | +| Core block scope readback | `blocked` | Materialized typed blocker; no visibility claim without export metadata. | +| Core block provenance readback | `blocked` | Materialized typed blocker; no provenance claim without source-id export. | +| Stale core detection | `blocked` | Still blocked until core export joins archival supersession evidence. | +| Archival fallback readback | `blocked` | Still blocked until archival search/readback maps fallback source ids. | +| Core/archive project-decision recovery | `blocked` | Still blocked until core routing plus archival rationale source ids are exported. | + +## Improvement/Regression Readback + +- Improved: reproducibility and auditability. The comparison now has a Docker-owned + command and durable JSON snapshot instead of only a research-gate note. +- Unchanged: competitive status. Letta remains blocked on live exported core/archive + evidence, so there is no ELF win, tie, or loss. +- No regression: the existing ELF `core_archival_memory` fixture pass remains + separate from Letta comparison scoring. + +## Claim Boundaries + +Allowed: + +- The Letta comparison has a reproducible Docker-contained materialization/report + command. +- The current default run preserves typed blockers when live Letta/provider setup + cannot produce export/readback evidence. + +Not allowed: + +- Do not claim ELF beats Letta on core-vs-archival memory from fixture-only ELF + evidence. +- Do not score Letta pass, win, tie, or loss unless exported core block JSON, + archival readback/search JSON, and fixture source ids are present. 
+ +## Next Optimization Direction + +The next live attempt should run: + +`ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 cargo make smoke-letta-core-archive-export-readback` + +Required fields before scoring can move beyond blocked: + +- exported core block JSON with fixture source ids, +- archival passage list/readback JSON with fixture source ids, +- archival search JSON for required evidence ids, +- model and embedding configuration, +- Docker-local agent/storage boundary, +- audit-equivalent metadata for source-id provenance. diff --git a/docs/evidence/benchmarking/index.md b/docs/evidence/benchmarking/index.md index 836a6f2e..47a31d1a 100644 --- a/docs/evidence/benchmarking/index.md +++ b/docs/evidence/benchmarking/index.md @@ -36,5 +36,6 @@ Routes to: Benchmarking evidence concepts under `docs/evidence/benchmarking/`. - `2026-06-16-proactive-brief-scoring-report.md`: Proactive Brief Scoring Report - June 16, 2026. - `2026-06-16-scheduled-memory-task-scoring-report.md`: Real-World Job Benchmark Report. - `2026-06-17-dreaming-competitor-strength-retest-report.md`: Dreaming Competitor-Strength Retest Report - June 17, 2026. +- `2026-06-19-letta-core-archive-export-readback-report.md`: Letta Core/Archive Export-Readback Report - June 19, 2026; adds a Docker-contained Letta materialization/report command while preserving all six core/archive comparison scenarios as typed blockers until exported core block JSON, archival readback/search JSON, and source ids exist. - `2026-06-19-openviking-trajectory-materialization-report.md`: OpenViking Trajectory Materialization Report - June 19, 2026; materializes the context-trajectory fixture slice through a dedicated repo task while preserving staged retrieval, hierarchy selection, and recursive/context expansion as typed blockers. 
- `2026-06-19-qmd-debug-ergonomics-dreaming-retest-report.md`: qmd Debug-Ergonomics Dreaming Retest Report - June 19, 2026; confirms qmd's default top-k/replay edge is unchanged while ELF keeps the narrow operator-debug trace/stage visibility wins. diff --git a/docs/log.md b/docs/log.md index a56ef411..fa379d2a 100644 --- a/docs/log.md +++ b/docs/log.md @@ -41,3 +41,7 @@ logs. - Added `cargo make real-world-memory-context-trajectory` as the reproducible context-trajectory benchmark entrypoint and linked the new report from the benchmarking evidence index and README. +- Added the Letta core/archive export-readback materialization report and snapshot + for XY-984, plus `cargo make smoke-letta-core-archive-export-readback`, preserving + all six Letta comparison scenarios as typed blockers until exported core block JSON, + archival readback/search JSON, and fixture source ids exist. diff --git a/scripts/letta-core-archive-export-readback-smoke.py b/scripts/letta-core-archive-export-readback-smoke.py new file mode 100755 index 00000000..ee31ffdc --- /dev/null +++ b/scripts/letta-core-archive-export-readback-smoke.py @@ -0,0 +1,1062 @@ +#!/usr/bin/env python3 +"""Docker-contained Letta core/archive export-readback smoke.""" + +from __future__ import annotations + +import json +import os +import shutil +import subprocess +import sys +import time +import urllib.error +import urllib.request +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + + +SCRIPT_DIR = Path(__file__).resolve().parent +ROOT_DIR = SCRIPT_DIR.parent +CORE_FIXTURE_DIR = ROOT_DIR / "apps" / "elf-eval" / "fixtures" / "real_world_memory" / "core_archival_memory" +REPORT_DIR = Path( + os.environ.get( + "ELF_LETTA_SMOKE_REPORT_DIR", + ROOT_DIR / "tmp" / "real-world-memory" / "letta-core-archive", + ) +) +WORK_DIR = Path(os.environ.get("ELF_LETTA_SMOKE_WORK_DIR", REPORT_DIR / "work")) +OUT = Path(os.environ.get("ELF_LETTA_SMOKE_OUT", 
REPORT_DIR / "letta-core-archive-export.json")) +MANIFEST_OUT = Path( + os.environ.get( + "ELF_LETTA_SMOKE_MANIFEST_OUT", + REPORT_DIR / "memory_projects_manifest.letta-core-archive.json", + ) +) +SUMMARY_OUT = Path(os.environ.get("ELF_LETTA_SMOKE_SUMMARY_OUT", REPORT_DIR / "summary.json")) +REPORT_JSON = Path(os.environ.get("ELF_LETTA_SMOKE_REPORT_JSON", REPORT_DIR / "report.json")) +REPORT_MD = Path(os.environ.get("ELF_LETTA_SMOKE_REPORT_MD", REPORT_DIR / "report.md")) +FIXTURE_DIR = REPORT_DIR / "letta-fixtures" +LOG_DIR = REPORT_DIR / "logs" + +RUN_ID = os.environ.get( + "ELF_LETTA_SMOKE_RUN_ID", + f"letta-core-archive-{datetime.now(timezone.utc).strftime('%Y%m%d%H%M%S')}", +) +RUN_LIVE = os.environ.get("ELF_LETTA_SMOKE_RUN", "0") == "1" +ALLOW_HOST = os.environ.get("ELF_LETTA_SMOKE_ALLOW_HOST", "0") == "1" +INSTALL_CLIENT = os.environ.get("ELF_LETTA_SMOKE_INSTALL_CLIENT", "1") == "1" +LETTA_BASE_URL = os.environ.get("ELF_LETTA_BASE_URL", "http://letta:8283") +LETTA_CLIENT_PACKAGE = os.environ.get("ELF_LETTA_CLIENT_PACKAGE", "letta-client") +LETTA_CLIENT_REF = os.environ.get("ELF_LETTA_CLIENT_REF", f"pypi:{LETTA_CLIENT_PACKAGE}") +LETTA_MODEL = os.environ.get("ELF_LETTA_MODEL", "openai/gpt-4o-mini") +LETTA_EMBEDDING = os.environ.get("ELF_LETTA_EMBEDDING", "openai/text-embedding-3-small") +TIMEOUT_SECONDS = int(os.environ.get("ELF_LETTA_TIMEOUT_SECONDS", "600")) +STARTUP_ATTEMPTS = int(os.environ.get("ELF_LETTA_STARTUP_ATTEMPTS", "30")) +STARTUP_INTERVAL_SECONDS = float(os.environ.get("ELF_LETTA_STARTUP_INTERVAL_SECONDS", "2")) + +CORE_KINDS = {"core_block", "core_block_contract", "core_block_event"} + + +@dataclass +class StatusState: + """Typed status for generated Letta smoke artifacts.""" + + setup: str = "blocked" + run: str = "not_encoded" + result: str = "blocked" + overall: str = "blocked" + evidence_class: str = "research_gate" + failure_class: str = "letta_live_run_disabled" + failure_reason: str = ( + "Letta live export/readback is disabled by 
default; run " + "ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 cargo make " + "smoke-letta-core-archive-export-readback with explicit Docker/provider configuration." + ) + + +@dataclass +class CommandRecord: + """Captured command result without secret-bearing environment values.""" + + label: str + command: list[str] + status: str + elapsed_ms: float + stdout_artifact: str | None + stderr_artifact: str | None + returncode: int | None + reason: str + + +def utc_now() -> str: + """Return an RFC3339 UTC timestamp.""" + + return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z") + + +def rel(path: Path) -> str: + """Return a repository-relative path when possible.""" + + try: + return str(path.resolve().relative_to(ROOT_DIR)) + except ValueError: + return str(path) + + +def mkdirs() -> None: + """Create and reset output directories owned by this smoke.""" + + for path in (FIXTURE_DIR, LOG_DIR): + if path.exists(): + shutil.rmtree(path) + + for path in (REPORT_DIR, WORK_DIR, FIXTURE_DIR, LOG_DIR): + path.mkdir(parents=True, exist_ok=True) + + for path in (OUT, MANIFEST_OUT, SUMMARY_OUT, REPORT_JSON, REPORT_MD): + if path.exists(): + path.unlink() + + +def write_json(path: Path, payload: Any) -> None: + """Write stable, pretty JSON.""" + + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8") + + +def command_available(name: str) -> bool: + """Return whether a command is available.""" + + return shutil.which(name) is not None + + +def run_command( + label: str, + command: list[str], + cwd: Path, + *, + extra_env: dict[str, str] | None = None, +) -> CommandRecord: + """Run a command and capture stdout/stderr artifacts.""" + + started = time.monotonic() + env = os.environ.copy() + if extra_env: + env.update(extra_env) + + try: + result = subprocess.run( + command, + cwd=cwd, + env=env, + text=True, + capture_output=True, + timeout=TIMEOUT_SECONDS, + 
check=False, + ) + elapsed = (time.monotonic() - started) * 1000 + stdout_path = LOG_DIR / f"{label}.stdout.txt" + stderr_path = LOG_DIR / f"{label}.stderr.txt" + stdout_path.write_text(result.stdout, encoding="utf-8") + stderr_path.write_text(result.stderr, encoding="utf-8") + status = "pass" if result.returncode == 0 else "incomplete" + reason = "command completed" if result.returncode == 0 else f"exit code {result.returncode}" + + return CommandRecord( + label=label, + command=command, + status=status, + elapsed_ms=elapsed, + stdout_artifact=rel(stdout_path), + stderr_artifact=rel(stderr_path), + returncode=result.returncode, + reason=reason, + ) + except subprocess.TimeoutExpired as exc: + elapsed = (time.monotonic() - started) * 1000 + stdout_path = LOG_DIR / f"{label}.stdout.txt" + stderr_path = LOG_DIR / f"{label}.stderr.txt" + stdout_path.write_text((exc.stdout.decode("utf-8", "replace") if isinstance(exc.stdout, bytes) else exc.stdout or ""), encoding="utf-8") + stderr_path.write_text((exc.stderr.decode("utf-8", "replace") if isinstance(exc.stderr, bytes) else exc.stderr or ""), encoding="utf-8") + + return CommandRecord( + label=label, + command=command, + status="incomplete", + elapsed_ms=elapsed, + stdout_artifact=rel(stdout_path), + stderr_artifact=rel(stderr_path), + returncode=None, + reason=f"timed out after {TIMEOUT_SECONDS}s", + ) + + +def command_to_json(record: CommandRecord) -> dict[str, Any]: + """Serialize a command record.""" + + return { + "label": record.label, + "command": record.command, + "status": record.status, + "elapsed_ms": round(record.elapsed_ms, 3), + "stdout_artifact": record.stdout_artifact, + "stderr_artifact": record.stderr_artifact, + "returncode": record.returncode, + "reason": record.reason, + } + + +def load_source_fixtures() -> list[dict[str, Any]]: + """Load the checked-in core_archival_memory fixture corpus.""" + + fixtures = [] + for path in sorted(CORE_FIXTURE_DIR.glob("*.json")): + payload = json.loads(path.read_text(encoding="utf-8")) + payload["_source_path"] = rel(path) + fixtures.append(payload) + + return fixtures + + +def evidence_ids_for_fixture(fixture: 
dict[str, Any]) -> list[str]: + """Return required evidence ids for one fixture.""" + + return [ + item["evidence_id"] + for item in fixture.get("required_evidence", []) + if isinstance(item, dict) and item.get("evidence_id") + ] + + +def all_required_evidence_ids(fixtures: list[dict[str, Any]]) -> list[str]: + """Return de-duplicated required evidence ids.""" + + ids: list[str] = [] + for fixture in fixtures: + for evidence_id in evidence_ids_for_fixture(fixture): + if evidence_id not in ids: + ids.append(evidence_id) + + return ids + + +def source_items(fixtures: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Flatten fixture corpus items with job metadata.""" + + items = [] + for fixture in fixtures: + for item in fixture.get("corpus", {}).get("items", []): + item_copy = dict(item) + item_copy["job_id"] = fixture["job_id"] + item_copy["fixture_source"] = fixture["_source_path"] + items.append(item_copy) + + return items + + +def benchmark_input_contract(fixtures: list[dict[str, Any]]) -> dict[str, Any]: + """Return the benchmark-owned Letta input contract.""" + + core_blocks = [] + archival_passages = [] + for item in source_items(fixtures): + record = { + "source_id": item["evidence_id"], + "job_id": item["job_id"], + "kind": item.get("kind"), + "text": item.get("text", ""), + "fixture_source": item["fixture_source"], + } + if item.get("kind") in CORE_KINDS: + core_blocks.append( + { + "label": slug(item["evidence_id"])[:48], + "value": f"Source ID: {item['evidence_id']}\n{item.get('text', '')}", + **record, + } + ) + elif item.get("kind") not in {"stale_claim", "unsupported_claim"}: + archival_passages.append( + { + "text": f"Source ID: {item['evidence_id']}\n{item.get('text', '')}", + **record, + } + ) + + return { + "core_blocks": core_blocks, + "archival_passages": archival_passages, + "source_id_count": len({item["evidence_id"] for item in source_items(fixtures)}), + "required_evidence_ids": all_required_evidence_ids(fixtures), + } + + +def slug(value: 
str) -> str: + """Return a small ASCII slug.""" + + out: list[str] = [] + last_dash = False + + for char in value.lower(): + if char.isascii() and char.isalnum(): + out.append(char) + last_dash = False + elif not last_dash and out: + out.append("-") + last_dash = True + + while out and out[-1] == "-": + out.pop() + + return "".join(out) or "item" + + +def wait_for_letta(command_records: list[CommandRecord]) -> bool: + """Wait for a Letta server endpoint to become reachable.""" + + started = time.monotonic() + probes = ["/v1/health", "/health", "/v1/models"] + last_reason = "not attempted" + for _ in range(STARTUP_ATTEMPTS): + for path in probes: + url = LETTA_BASE_URL.rstrip("/") + path + try: + with urllib.request.urlopen(url, timeout=5) as response: + if 200 <= response.status < 500: + command_records.append( + CommandRecord( + label="letta-health-probe", + command=["GET", url], + status="pass", + elapsed_ms=(time.monotonic() - started) * 1000, + stdout_artifact=None, + stderr_artifact=None, + returncode=0, + reason=f"reachable via {path}", + ) + ) + return True + except (urllib.error.URLError, TimeoutError, OSError) as exc: + last_reason = str(exc) + + time.sleep(STARTUP_INTERVAL_SECONDS) + + command_records.append( + CommandRecord( + label="letta-health-probe", + command=["GET", LETTA_BASE_URL.rstrip("/") + "/v1/health"], + status="incomplete", + elapsed_ms=(time.monotonic() - started) * 1000, + stdout_artifact=None, + stderr_artifact=None, + returncode=None, + reason=last_reason, + ) + ) + return False + + +def init_letta_client(command_records: list[CommandRecord]) -> bool: + """Install or verify the Letta Python client.""" + + if INSTALL_CLIENT: + record = run_command( + "letta-client-install", + [sys.executable, "-m", "pip", "install", LETTA_CLIENT_PACKAGE], + WORK_DIR, + ) + command_records.append(record) + if record.status != "pass": + return False + + record = run_command("letta-client-import", [sys.executable, "-c", "import letta_client"], WORK_DIR) + 
command_records.append(record) + + return record.status == "pass" + + +def write_live_runner(fixtures: list[dict[str, Any]]) -> Path: + """Write a small Python runner that uses the current Letta SDK.""" + + contract = benchmark_input_contract(fixtures) + input_path = WORK_DIR / "letta-live-input.json" + write_json(input_path, contract) + + runner = WORK_DIR / "letta_live_runner.py" + runner.write_text( + """ +import json +import os +from pathlib import Path + +from letta_client import Letta + + +def as_dict(value): + if hasattr(value, "model_dump"): + return value.model_dump(mode="json") + if hasattr(value, "dict"): + return value.dict() + return json.loads(json.dumps(value, default=str)) + + +input_path = Path(os.environ["ELF_LETTA_LIVE_INPUT"]) +output_path = Path(os.environ["ELF_LETTA_LIVE_OUTPUT"]) +data = json.loads(input_path.read_text()) + +client = Letta(base_url=os.environ["ELF_LETTA_BASE_URL"]) +agent = client.agents.create( + name=os.environ.get("ELF_LETTA_AGENT_NAME", "elf-core-archive-smoke"), + model=os.environ["ELF_LETTA_MODEL"], + embedding=os.environ["ELF_LETTA_EMBEDDING"], + memory_blocks=[ + {"label": item["label"], "value": item["value"]} + for item in data["core_blocks"] + ], +) + +created_passages = [] +for passage in data["archival_passages"]: + created_passages.append( + as_dict(client.agents.passages.create(agent_id=agent.id, text=passage["text"])) + ) + +core_block_export = [] +for item in data["core_blocks"]: + core_block_export.append( + { + "source_id": item["source_id"], + "label": item["label"], + "block": as_dict( + client.agents.blocks.retrieve(agent_id=agent.id, block_label=item["label"]) + ), + } + ) + +listed_passages = as_dict(client.agents.passages.list(agent_id=agent.id)) +search_results = [] +for source_id in data["required_evidence_ids"]: + search_results.append( + { + "query": source_id, + "response": as_dict( + client.agents.passages.search(agent_id=agent.id, query=source_id, top_k=5) + ), + } + ) + 
+output_path.write_text( + json.dumps( + { + "agent": as_dict(agent), + "core_block_export": core_block_export, + "created_passages": created_passages, + "archival_readback": listed_passages, + "archival_search": search_results, + }, + indent=2, + sort_keys=True, + ) + + "\\n" +) +""".lstrip(), + encoding="utf-8", + ) + + return runner + + +def run_letta(fixtures: list[dict[str, Any]], command_records: list[CommandRecord]) -> dict[str, Any] | None: + """Create the Letta benchmark agent and export readback/search data.""" + + runner = write_live_runner(fixtures) + output_path = WORK_DIR / "letta-live-output.json" + env = { + "ELF_LETTA_BASE_URL": LETTA_BASE_URL, + "ELF_LETTA_MODEL": LETTA_MODEL, + "ELF_LETTA_EMBEDDING": LETTA_EMBEDDING, + "ELF_LETTA_LIVE_INPUT": str(WORK_DIR / "letta-live-input.json"), + "ELF_LETTA_LIVE_OUTPUT": str(output_path), + "ELF_LETTA_AGENT_NAME": f"elf-core-archive-smoke-{RUN_ID}", + } + record = run_command("letta-live-export-readback", [sys.executable, str(runner)], WORK_DIR, extra_env=env) + command_records.append(record) + if record.status != "pass" or not output_path.exists(): + return None + + return json.loads(output_path.read_text(encoding="utf-8")) + + +def ids_in_payload(payload: Any, evidence_ids: list[str]) -> list[str]: + """Return evidence ids present anywhere in a JSON-compatible payload.""" + + haystack = json.dumps(payload, sort_keys=True, default=str) + return [evidence_id for evidence_id in evidence_ids if evidence_id in haystack] + + +def evidence_mapping( + fixtures: list[dict[str, Any]], + live_export: dict[str, Any] | None, + status: StatusState, +) -> dict[str, Any]: + """Map observed Letta export/readback data to fixture source ids.""" + + required_ids = all_required_evidence_ids(fixtures) + if live_export is None: + mapped_ids: list[str] = [] + else: + mapped_ids = ids_in_payload(live_export, required_ids) + + missing_ids = [evidence_id for evidence_id in required_ids if evidence_id not in mapped_ids] + jobs = [] + 
for fixture in fixtures: + expected = evidence_ids_for_fixture(fixture) + mapped = [evidence_id for evidence_id in expected if evidence_id in mapped_ids] + if status.result in {"blocked", "incomplete", "not_encoded"}: + job_status = status.result + reason = status.failure_reason + elif len(mapped) == len(expected): + job_status = "pass" + reason = "Letta core block export and archival readback/search mapped all required source ids." + else: + job_status = "wrong_result" + missing = [evidence_id for evidence_id in expected if evidence_id not in mapped] + reason = f"Letta export/readback missed required evidence ids: {', '.join(missing)}." + + jobs.append( + { + "job_id": fixture["job_id"], + "source_fixture": fixture["_source_path"], + "expected_evidence_ids": expected, + "mapped_evidence_ids": mapped, + "missing_evidence_ids": [evidence_id for evidence_id in expected if evidence_id not in mapped], + "status": job_status, + "reason": reason, + } + ) + + return { + "status": status.result if missing_ids or live_export is None else "pass", + "reason": status.failure_reason + if live_export is None + else ( + "Letta export/readback mapped all required fixture source ids." + if not missing_ids + else f"Letta export/readback missed required evidence ids: {', '.join(missing_ids)}." 
+ ), + "expected_evidence_ids": required_ids, + "mapped_evidence_ids": mapped_ids, + "missing_evidence_ids": missing_ids, + "jobs": jobs, + } + + +def write_fixture_outputs( + fixtures: list[dict[str, Any]], + status: StatusState, + mapping: dict[str, Any], +) -> Path: + """Write generated Letta real_world_job fixtures.""" + + for fixture in fixtures: + generated = json.loads(json.dumps({k: v for k, v in fixture.items() if k != "_source_path"})) + generated["corpus"]["profile"] = "external_adapter" + generated["corpus"]["corpus_id"] = "letta-core-archive-export-readback-2026-06-19" + job_mapping = next(item for item in mapping["jobs"] if item["job_id"] == fixture["job_id"]) + source_answer = fixture.get("corpus", {}).get("adapter_response", {}).get("answer", {}) + generated["corpus"]["adapter_response"] = { + "adapter_id": "letta_core_archive_export_readback", + "answer": { + "content": source_answer.get("content", ""), + "claims": source_answer.get("claims", []), + "evidence_ids": evidence_ids_for_fixture(fixture), + "latency_ms": 0.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0, + }, + }, + } + generated["tags"] = sorted(set(generated.get("tags", []) + ["external_adapter", "letta_export_readback"])) + generated["encoding"] = {} + if job_mapping["status"] in {"blocked", "incomplete", "not_encoded"}: + generated["encoding"] = { + "status": job_mapping["status"], + "reason": job_mapping["reason"], + "follow_up": { + "title": "Produce Letta core/archive export-readback evidence", + "reason": ( + "The benchmark must export Letta core block JSON, archival readback/search JSON, " + "and fixture source ids before this scenario can be scored as pass or wrong_result." 
+ ), + }, + } + + if job_mapping["status"] == "wrong_result": + generated["corpus"]["adapter_response"]["answer"]["evidence_ids"] = job_mapping[ + "mapped_evidence_ids" + ] + + fixture_path = FIXTURE_DIR / "core_archival_memory" / Path(fixture["_source_path"]).name + write_json(fixture_path, generated) + + return FIXTURE_DIR / "core_archival_memory" + + +def run_scored_report(fixture_path: Path, manifest_path: Path, status: StatusState) -> dict[str, Any]: + """Score the generated Letta fixtures through the real-world job runner.""" + + run_cmd = [ + "cargo", + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + str(fixture_path), + "--out", + str(REPORT_JSON), + "--run-id", + "real-world-memory-live-letta-core-archive", + "--adapter-id", + "letta_core_archive_export_readback", + "--adapter-name", + "Letta core/archive export-readback adapter", + "--adapter-behavior", + "docker_core_archive_export_readback", + "--adapter-storage-status", + status.setup, + "--adapter-runtime-status", + status.overall, + "--adapter-notes", + "Generated by the Letta core/archive export-readback smoke; pass requires exported core block JSON, archival readback/search JSON, and mapped fixture source ids.", + "--external-adapter-manifest", + str(manifest_path), + ] + publish_cmd = [ + "cargo", + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + str(REPORT_JSON), + "--out", + str(REPORT_MD), + ] + + subprocess.run(run_cmd, cwd=ROOT_DIR, check=True) + subprocess.run(publish_cmd, cwd=ROOT_DIR, check=True) + + report = json.loads(REPORT_JSON.read_text(encoding="utf-8")) + return { + "json": rel(REPORT_JSON), + "markdown": rel(REPORT_MD), + "summary": report.get("summary", {}), + "suites": report.get("suites", []), + } + + +def scored_benchmark(report: dict[str, Any] | None) -> dict[str, Any]: + """Extract the post-score benchmark status from a real_world_job report.""" + + if report is 
None: + return { + "schema": "elf.scored_benchmark_status/v1", + "source": "real_world_job_benchmark", + "status": "pending", + "reason": "The Letta smoke materialization was written before benchmark scoring completed.", + } + + summary = report.get("summary", {}) + counts = { + status: int(summary.get(status, 0) or 0) + for status in ("pass", "wrong_result", "lifecycle_fail", "incomplete", "blocked", "not_encoded") + } + status = next((name for name, count in counts.items() if name != "pass" and count > 0), "pass") + + return { + "schema": "elf.scored_benchmark_status/v1", + "source": "real_world_job_benchmark", + "status": status, + "counts": counts, + "job_count": int(summary.get("job_count", 0) or 0), + "mean_score": summary.get("mean_score"), + "evidence_coverage": summary.get("evidence_coverage"), + } + + +def write_materialization( + status: StatusState, + fixtures: list[dict[str, Any]], + fixture_path: Path, + command_records: list[CommandRecord], + live_export: dict[str, Any] | None, + mapping: dict[str, Any], + started_at: float, + report: dict[str, Any] | None = None, +) -> dict[str, Any]: + """Write the primary Letta materialization artifact.""" + + elapsed_ms = (time.monotonic() - started_at) * 1000 + payload = { + "schema": "elf.letta_core_archive_export_readback/v1", + "generated_at": utc_now(), + "run_id": RUN_ID, + "adapter_id": "letta_core_archive_export_readback", + "project": "Letta", + "evidence_class": status.evidence_class, + "status": { + "source": "smoke_materialization", + "setup": status.setup, + "run": status.run, + "result": status.result, + "overall": status.overall, + "failure_class": status.failure_class, + "failure_reason": status.failure_reason, + }, + "scored_benchmark": scored_benchmark(report), + "artifacts": { + "materialization": rel(OUT), + "manifest": rel(MANIFEST_OUT), + "summary": rel(SUMMARY_OUT), + "generated_fixture_dir": rel(fixture_path), + "scored_report_json": rel(REPORT_JSON), + "scored_report_markdown": 
rel(REPORT_MD), + "live_output": rel(WORK_DIR / "letta-live-output.json") + if (WORK_DIR / "letta-live-output.json").exists() + else None, + }, + "docker_boundary": { + "compose_file": "docker-compose.baseline.yml", + "service_profile": "letta", + "runner_service": "baseline-runner", + "runner": "scripts/letta-core-archive-export-readback-smoke.py", + "host_global_installs_required": False, + "docker_only": True, + "host_global_letta_state_used": False, + "hosted_letta_state_used": False, + }, + "provider_configuration": { + "base_url": LETTA_BASE_URL, + "client_package": LETTA_CLIENT_REF, + "model": LETTA_MODEL, + "embedding": LETTA_EMBEDDING, + "live_run_enabled": RUN_LIVE, + "operator_owned_provider_credentials_used": False, + }, + "benchmark_input": benchmark_input_contract(fixtures), + "letta_export": { + "core_block_json": live_export.get("core_block_export", []) if live_export else [], + "archival_readback_json": live_export.get("archival_readback") if live_export else None, + "archival_search_json": live_export.get("archival_search", []) if live_export else [], + "agent": live_export.get("agent") if live_export else None, + "status": "exported" if live_export else status.result, + }, + "resource_bounds": { + "source_fixture_count": len(fixtures), + "core_block_count": len(benchmark_input_contract(fixtures)["core_blocks"]), + "archival_passage_count": len(benchmark_input_contract(fixtures)["archival_passages"]), + "timeout_seconds": TIMEOUT_SECONDS, + "elapsed_ms": round(elapsed_ms, 3), + }, + "commands": [command_to_json(record) for record in command_records], + "evidence_mapping": mapping, + "improvement_regression_readback": { + "baseline": "XY-955 left Letta core/archive comparison blocked because no contained export/readback artifact existed.", + "current": ( + "unchanged: the benchmark now has a Docker-contained materialization command and typed report, " + "but the default run still preserves Letta comparison as blocked until live export/search data 
maps source ids." + ) + if status.result != "pass" + else "improved: Letta export/readback mapped all required core/archive source ids.", + "judgment": "improved" if status.result == "pass" else "unchanged", + }, + "claim_boundaries": { + "allowed": [ + "The Letta comparison now has a reproducible Docker-contained materialization/report command.", + "The current default report may preserve typed blockers when live Letta/provider setup cannot produce export/readback evidence.", + ], + "not_allowed": [ + "Do not claim ELF beats Letta on core-vs-archival memory from fixture-only ELF evidence.", + "Do not score Letta pass, win, tie, or loss unless exported core block JSON, archival readback/search JSON, and fixture source ids are present.", + ], + }, + } + write_json(OUT, payload) + + return payload + + +def write_manifest(status: StatusState) -> dict[str, Any]: + """Write a generated external adapter manifest for this smoke.""" + + manifest = { + "schema": "elf.real_world_external_adapter_manifest/v1", + "manifest_id": f"letta-core-archive-export-readback-{RUN_ID}", + "docker_isolation": { + "default": True, + "compose_file": "docker-compose.baseline.yml", + "runner": "scripts/letta-core-archive-export-readback-smoke.py", + "artifact_dir": "tmp/real-world-memory/letta-core-archive", + "host_global_installs_required": False, + "notes": [ + f"Generated by the Letta core/archive export-readback smoke at {utc_now()}.", + "The smoke uses checked-in core_archival_memory fixtures and records typed setup/runtime failures.", + ], + }, + "adapters": [ + { + "adapter_id": "letta_core_archive_export_readback", + "project": "Letta", + "adapter_kind": "docker_core_archive_export_readback", + "evidence_class": status.evidence_class, + "docker_default": True, + "host_global_installs_required": False, + "overall_status": status.overall, + "setup": { + "status": status.setup, + "evidence": "The smoke runs inside the baseline Docker runner and can use a Docker-profile Letta server with 
explicit model and embedding configuration.", + "command": "cargo make smoke-letta-core-archive-export-readback", + "artifact": rel(OUT), + }, + "run": { + "status": status.run, + "evidence": "The live path creates a benchmark-owned Letta agent, imports fixture source ids into core blocks and archival passages, then exports block/readback/search JSON.", + "command": "ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 cargo make smoke-letta-core-archive-export-readback", + "artifact": rel(OUT), + }, + "result": { + "status": status.result, + "evidence": status.failure_reason + if status.failure_reason + else "Letta core block export, archival readback, and archival search mapped required fixture source ids.", + "artifact": rel(OUT), + }, + "capabilities": [ + { + "capability": "docker_letta_server_boundary", + "status": status.setup, + "evidence": "The runner uses docker-compose.baseline.yml and avoids host-global Letta state or hosted/private agents.", + }, + { + "capability": "core_block_export", + "status": status.run, + "evidence": "Live scoring requires retrieving Letta memory blocks with fixture source ids embedded in block values.", + }, + { + "capability": "archival_readback_search_export", + "status": status.result, + "evidence": "Live scoring requires archival passage list/search JSON to map required source ids.", + }, + { + "capability": "broad_letta_quality_claim", + "status": "not_encoded", + "evidence": "The smoke does not claim broad Letta product quality, private corpus behavior, or hosted-service parity.", + }, + ], + "suites": [ + { + "suite_id": "core_archival_memory", + "status": status.result, + "evidence": "Only the six checked-in core_archival_memory scenarios are represented.", + }, + { + "suite_id": "personalization", + "status": "not_encoded", + "evidence": "Scoped preference behavior is outside this core/archive export smoke.", + }, + { + "suite_id": "project_decisions", + "status": status.result, + "evidence": "Project-decision recovery is 
scored only through the core_archival_memory fixture that requires core routing plus archival rationale source ids.", + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Agent resumption across sessions is not encoded by this export/readback smoke.", + }, + ], + "evidence": [ + {"kind": "artifact", "ref": rel(OUT), "status": status.result}, + {"kind": "manifest", "ref": rel(MANIFEST_OUT), "status": status.overall}, + {"kind": "source", "ref": "https://docs.letta.com/guides/docker", "status": "real"}, + {"kind": "source", "ref": "https://docs.letta.com/api/python", "status": "real"}, + { + "kind": "source", + "ref": "https://docs.letta.com/api/resources/agents/subresources/passages/methods/search", + "status": "real", + }, + ], + "execution_metadata": { + "sources": [ + { + "label": "Letta Docker docs", + "url": "https://docs.letta.com/guides/docker", + "evidence": "Official Docker setup and explicit embedding configuration boundary.", + }, + { + "label": "Letta Python API", + "url": "https://docs.letta.com/api/python", + "evidence": "Official Python SDK memory block creation and retrieval examples.", + }, + { + "label": "Letta archival search API", + "url": "https://docs.letta.com/api/resources/agents/subresources/passages/methods/search", + "evidence": "Official archival-memory search endpoint contract.", + }, + ], + "setup_path": "Run cargo make smoke-letta-core-archive-export-readback for a typed artifact; set ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 with explicit model/provider configuration for a live export attempt.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner plus optional Letta server profile, benchmark-created agent, benchmark-owned fixture corpus, and artifacts under tmp/real-world-memory/letta-core-archive.", + "resource_expectation": f"Letta client {LETTA_CLIENT_REF}, model={LETTA_MODEL}, embedding={LETTA_EMBEDDING}, source fixture count=6, timeout_seconds={TIMEOUT_SECONDS}.", + "retry_guidance": 
[ + "Default command records a typed blocked artifact without model calls.", + "Enable the live path only with a Docker-local Letta server and explicit provider or local model configuration.", + "Score only when core block export and archival list/search output map to required fixture source ids.", + ], + "research_depth": "XY-984 materialization contract; generated artifact decides live evidence class.", + }, + "notes": [ + "Failure before Letta export/readback remains typed as blocked or incomplete.", + "The smoke does not use hosted/private Letta state or operator-owned data.", + ], + } + ], + } + write_json(MANIFEST_OUT, manifest) + + return manifest + + +def write_summary(materialization: dict[str, Any], manifest: dict[str, Any], report: dict[str, Any]) -> None: + """Write a small summary artifact.""" + + write_json( + SUMMARY_OUT, + { + "schema": "elf.letta_core_archive_export_readback_summary/v1", + "generated_at": utc_now(), + "adapter_id": "letta_core_archive_export_readback", + "evidence_class": materialization["evidence_class"], + "status_boundary": { + "materialization": "setup/run/evidence-mapping state emitted by the smoke runner", + "manifest": "external adapter declaration consumed by the scorer", + "scored_benchmark": "post-score real_world_job outcome; use this for quality status", + }, + "scored_benchmark": materialization["scored_benchmark"], + "materialization": materialization, + "manifest": { + "json": rel(MANIFEST_OUT), + "status_source": "external_adapter_manifest_score_aligned", + "summary": manifest["adapters"][0]["overall_status"], + "suites": manifest["adapters"][0]["suites"], + }, + "report": report, + }, + ) + + +def main() -> int: + """Run the smoke and always emit typed artifacts when possible.""" + + started_at = time.monotonic() + mkdirs() + status = StatusState() + command_records: list[CommandRecord] = [] + fixtures = load_source_fixtures() + live_export: dict[str, Any] | None = None + + if not Path("/.dockerenv").exists() and 
not ALLOW_HOST: + status.setup = "incomplete" + status.result = "incomplete" + status.overall = "incomplete" + status.failure_class = "not_running_in_docker" + status.failure_reason = "Letta smoke must run inside Docker; use cargo make smoke-letta-core-archive-export-readback." + elif not command_available("python3"): + status.setup = "incomplete" + status.result = "incomplete" + status.overall = "incomplete" + status.failure_class = "python_missing" + status.failure_reason = "python3 is required for the Letta smoke runner." + elif not RUN_LIVE: + pass + elif not wait_for_letta(command_records): + status.setup = "incomplete" + status.result = "incomplete" + status.overall = "incomplete" + status.failure_class = "letta_server_unreachable" + status.failure_reason = "Docker-local Letta server did not become reachable for export/readback." + elif not init_letta_client(command_records): + status.setup = "incomplete" + status.result = "incomplete" + status.overall = "incomplete" + status.failure_class = "letta_client_setup_failed" + status.failure_reason = "Letta Python client installation or import failed inside the Docker runner." + else: + status.setup = "pass" + live_export = run_letta(fixtures, command_records) + if live_export is None: + status.run = "incomplete" + status.result = "incomplete" + status.overall = "incomplete" + status.failure_class = "letta_export_readback_failed" + status.failure_reason = "Letta did not produce core block export plus archival readback/search output." 
+ else: + status.run = "pass" + status.evidence_class = "live_real_world" + mapping = evidence_mapping(fixtures, live_export, status) + if not mapping["missing_evidence_ids"]: + status.result = "pass" + status.overall = "pass" + status.failure_class = "" + status.failure_reason = "" + else: + status.result = "wrong_result" + status.overall = "wrong_result" + status.failure_class = "letta_source_id_mapping_failed" + status.failure_reason = mapping["reason"] + + mapping = evidence_mapping(fixtures, live_export, status) + fixture_path = write_fixture_outputs(fixtures, status, mapping) + write_materialization( + status, + fixtures, + fixture_path, + command_records, + live_export, + mapping, + started_at, + ) + manifest = write_manifest(status) + report = run_scored_report(fixture_path, MANIFEST_OUT, status) + materialization = write_materialization( + status, + fixtures, + fixture_path, + command_records, + live_export, + mapping, + started_at, + report, + ) + write_summary(materialization, manifest, report) + print(f"Letta core/archive artifact: {OUT}") + print(f"Letta core/archive manifest: {MANIFEST_OUT}") + print(f"Letta core/archive summary: {SUMMARY_OUT}") + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/smoke-docker.sh b/scripts/smoke-docker.sh index 6aa816a8..a4464a46 100755 --- a/scripts/smoke-docker.sh +++ b/scripts/smoke-docker.sh @@ -70,6 +70,31 @@ graphrag-docker) -e ELF_GRAPHRAG_MAX_INPUT_CHARS \ baseline-runner python3 scripts/graphrag-docker-smoke.py ;; +letta-core-archive-export-readback) + start="$(printenv ELF_LETTA_SMOKE_START || true)" + status=0 + if [ "$start" = "1" ]; then + docker compose -f docker-compose.baseline.yml --profile letta up -d letta + fi + docker compose -f docker-compose.baseline.yml run --build --rm \ + -e ELF_LETTA_SMOKE_RUN \ + -e ELF_LETTA_SMOKE_REPORT_DIR \ + -e ELF_LETTA_SMOKE_WORK_DIR \ + -e ELF_LETTA_SMOKE_INSTALL_CLIENT \ + -e ELF_LETTA_CLIENT_PACKAGE \ + -e 
ELF_LETTA_CLIENT_REF \ + -e ELF_LETTA_BASE_URL \ + -e ELF_LETTA_MODEL \ + -e ELF_LETTA_EMBEDDING \ + -e ELF_LETTA_TIMEOUT_SECONDS \ + -e ELF_LETTA_STARTUP_ATTEMPTS \ + -e ELF_LETTA_STARTUP_INTERVAL_SECONDS \ + baseline-runner python3 scripts/letta-core-archive-export-readback-smoke.py || status=$? + if [ "$start" = "1" ]; then + docker compose -f docker-compose.baseline.yml --profile letta stop letta >/dev/null 2>&1 || true + fi + exit "$status" + ;; lightrag-docker-context) start="$(printenv ELF_LIGHTRAG_CONTEXT_START || true)" status=0