diff --git a/Makefile.toml b/Makefile.toml index e11d12b9..5ad26b18 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -31,6 +31,9 @@ # | real-world-memory-p1-closeout | composite | | # | real-world-memory-p1-closeout-json | command | | # | real-world-memory-p1-closeout-report | command | | +# | real-world-memory-p4-production-readiness | composite | | +# | real-world-memory-p4-production-readiness-json | command | | +# | real-world-memory-p4-production-readiness-report | command | | # | real-world-memory-p2-knowledge-closeout | composite | | # | real-world-memory-core-archival | composite | | # | real-world-memory-core-archival-json | command | | @@ -429,6 +432,55 @@ args = [ "tmp/real-world-memory/p1-closeout/report.md", ] +[tasks.real-world-memory-p4-production-readiness] +workspace = false +dependencies = [ + "real-world-memory-p4-production-readiness-report", +] + +[tasks.real-world-memory-p4-production-readiness-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory/production_ops", + "--out", + "tmp/real-world-memory/p4-production-readiness/report.json", + "--run-id", + "real-world-memory-p4-production-readiness", + "--adapter-id", + "fixture_production_ops", + "--adapter-name", + "ELF P4 production-readiness fixture", +] + +[tasks.real-world-memory-p4-production-readiness-report] +workspace = false +dependencies = [ + "real-world-memory-p4-production-readiness-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/p4-production-readiness/report.json", + "--out", + "tmp/real-world-memory/p4-production-readiness/report.md", +] + [tasks.real-world-memory-p2-knowledge-closeout] workspace = false dependencies = [ diff --git a/README.md b/README.md index fd009593..2b52d0d4 100644 --- a/README.md +++ 
b/README.md @@ -308,6 +308,14 @@ provider-backed ELF evidence was required. OpenViking trajectory, and graph/RAG citation/navigation remain optimization inputs or typed blockers. The report makes P4 queue items inspectable but applies no `decodex:queued:elf` label. +- P4 production-readiness evidence gates after XY-1074: the June 23 follow-up adds + `cargo make real-world-memory-p4-production-readiness`, a checked-in evidence + report, and `elf.operational_evidence_gates/v1`. The production-ops slice scores + 7 jobs, 5 pass, 0 wrong_result, 0 incomplete, and 2 typed blockers while separating + local fixture, public-proxy, private-corpus, and provider-backed tiers. It records + latency, cost, resource, cold-start, restore, and Qdrant rebuild metrics, but keeps + missing private-corpus manifests and provider credentials as typed blockers rather + than private/provider-backed pass proof. - Operator-approved public-proxy addendum after XY-930: the June 19 follow-up runs `cargo make baseline-production-private-addendum` with a simulated/public-proxy production corpus manifest approved for this stage. 
The run records 12 documents, @@ -451,6 +459,7 @@ Detailed evidence and interpretation: - [Temporal and Trajectory Adapter Coverage Report - June 23, 2026](docs/evidence/benchmarking/2026-06-23-temporal-trajectory-adapter-coverage-report.md) - [Graph/RAG Adapter Matrix Report - June 23, 2026](docs/evidence/benchmarking/2026-06-23-graph-rag-adapter-matrix-report.md) - [P3 Competitor-Strength Absorption Report - June 23, 2026](docs/evidence/benchmarking/2026-06-23-p3-competitor-strength-absorption-report.md) +- [P4 Production-Readiness Evidence Gates Report - June 23, 2026](docs/evidence/benchmarking/2026-06-23-p4-production-readiness-evidence-gates-report.md) - [Live Baseline Benchmark Runbook](docs/runbook/benchmarking/live_baseline_benchmark.md) - [Single-User Production Runbook](docs/runbook/single_user_production.md) - Benchmark contract: @@ -547,6 +556,7 @@ Detailed comparison, mechanism-level analysis, and source map: - [Temporal and Trajectory Adapter Coverage Report - June 23, 2026](docs/evidence/benchmarking/2026-06-23-temporal-trajectory-adapter-coverage-report.md) - [Graph/RAG Adapter Matrix Report - June 23, 2026](docs/evidence/benchmarking/2026-06-23-graph-rag-adapter-matrix-report.md) - [P3 Competitor-Strength Absorption Report - June 23, 2026](docs/evidence/benchmarking/2026-06-23-p3-competitor-strength-absorption-report.md) +- [P4 Production-Readiness Evidence Gates Report - June 23, 2026](docs/evidence/benchmarking/2026-06-23-p4-production-readiness-evidence-gates-report.md) - [Live Baseline Benchmark Runbook](docs/runbook/benchmarking/live_baseline_benchmark.md) - [Real-World Agent Memory Benchmark](docs/runbook/benchmarking/real_world_agent_memory_benchmark.md) - [External Memory Improvement Plan](docs/evidence/external_memory/external_memory_improvement_plan.md) @@ -566,7 +576,8 @@ Report - June 20, 2026, and the Live Knowledge-Page Rebuild/Lint Report - June 2 Workspace PageIndex/OpenKB Closeout Report, PageIndex/OpenKB Same-Corpus Adapter 
Report, and mem0/OpenMemory and Letta Memory-History/Core-Archive Adapter Report; June 23 adds the Temporal and Trajectory Adapter Coverage Report, the Graph/RAG -Adapter Matrix Report, and the P3 Competitor-Strength Absorption Report after the +Adapter Matrix Report, the P3 Competitor-Strength Absorption Report, and the P4 +Production-Readiness Evidence Gates Report after the June 19 XY-930 operator-approved public-proxy production addendum and service-native Dreaming readback, the qmd debug-ergonomics Dreaming retest, the June 17 competitor-strength closeout, and the June 16 temporal reconciliation, live diff --git a/apps/elf-eval/fixtures/real_world_memory/production_ops/credential_boundary_provider_blocked.json b/apps/elf-eval/fixtures/real_world_memory/production_ops/credential_boundary_provider_blocked.json index f0a46864..0b7a1552 100644 --- a/apps/elf-eval/fixtures/real_world_memory/production_ops/credential_boundary_provider_blocked.json +++ b/apps/elf-eval/fixtures/real_world_memory/production_ops/credential_boundary_provider_blocked.json @@ -195,5 +195,12 @@ "acceptable_phrases": ["blocked until operator credentials are supplied", "must not require user secrets"], "fallback_action": "state_blocker" }, - "tags": ["external_adapter", "production_ops", "credential_boundary", "blocked", "no_live_claim"] + "tags": [ + "external_adapter", + "provider_backed", + "production_ops", + "credential_boundary", + "blocked", + "no_live_claim" + ] } diff --git a/apps/elf-eval/fixtures/real_world_memory/production_ops/public_proxy_production_private_addendum.json b/apps/elf-eval/fixtures/real_world_memory/production_ops/public_proxy_production_private_addendum.json new file mode 100644 index 00000000..a7032fce --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/production_ops/public_proxy_production_private_addendum.json @@ -0,0 +1,242 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "production-ops-public-proxy-addendum-001", + "suite": "production_ops", 
+ "title": "Separate operator-approved public-proxy evidence from private and provider proof", + "corpus": { + "corpus_id": "real-world-memory-production-ops-2026-06-19-public-proxy", + "profile": "generated_public", + "items": [ + { + "evidence_id": "public-proxy-addendum-pass", + "kind": "benchmark_report", + "text": "The operator-approved public-proxy production-private addendum passed 8/8 query checks with 0 wrong_result, 0 lifecycle_fail, 0 blocked, 0 incomplete, and 0 not_encoded.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "public_proxy_production_private_addendum", + "evidence_id": "public-proxy-addendum-pass" + }, + "locator": { + "quote": "passed 8/8 query checks with 0 wrong_result" + } + }, + "created_at": "2026-06-19T14:40:13Z" + }, + { + "evidence_id": "public-proxy-latency-resource-cost", + "kind": "benchmark_report", + "text": "The public-proxy addendum recorded query mean latency 10.842727625 ms, P95 30.443385 ms, elapsed 1.313984156 seconds, RSS 37656 KB, and 386 estimated input tokens with no configured cost rate.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "public_proxy_production_private_addendum", + "evidence_id": "public-proxy-latency-resource-cost" + }, + "locator": { + "quote": "query mean latency 10.842727625 ms" + } + }, + "created_at": "2026-06-19T14:40:13Z" + }, + { + "evidence_id": "public-proxy-claim-boundary", + "kind": "decision", + "text": "The public-proxy addendum is not real private-corpus production proof and does not prove provider-backed production quality because embedding mode was local-hash.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "public_proxy_production_private_addendum", + "evidence_id": "public-proxy-claim-boundary" + }, + "locator": { + "quote": "not real private-corpus production proof" + } + }, + 
"created_at": "2026-06-19T14:40:13Z" + }, + { + "evidence_id": "public-proxy-private-proof-decoy", + "kind": "decision", + "text": "Decoy: the public-proxy addendum proves real private-corpus production quality and provider-backed embedding quality.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "public_proxy_production_private_addendum", + "evidence_id": "public-proxy-private-proof-decoy" + } + }, + "created_at": "2026-06-19T14:40:13Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_production_ops", + "answer": { + "content": "The operator-approved public-proxy addendum passed 8/8 query checks with 0 wrong_result. It recorded query mean latency 10.842727625 ms, P95 30.443385 ms, elapsed 1.313984156 seconds, RSS 37656 KB, and 386 estimated input tokens. This is public-proxy evidence only: it is not real private-corpus production proof and does not prove provider-backed production quality because embedding mode was local-hash.", + "claims": [ + { + "claim_id": "public_proxy_passed", + "text": "The operator-approved public-proxy addendum passed 8/8 query checks with 0 wrong_result.", + "evidence_ids": [ + "public-proxy-addendum-pass" + ], + "confidence": "high" + }, + { + "claim_id": "public_proxy_operational_envelope", + "text": "The public-proxy addendum recorded latency, resource, and token cost-proxy metrics.", + "evidence_ids": [ + "public-proxy-latency-resource-cost" + ], + "confidence": "high" + }, + { + "claim_id": "public_proxy_boundary", + "text": "The public-proxy addendum is not private-corpus or provider-backed production proof.", + "evidence_ids": [ + "public-proxy-claim-boundary" + ], + "confidence": "high" + } + ], + "evidence_ids": [ + "public-proxy-addendum-pass", + "public-proxy-latency-resource-cost", + "public-proxy-claim-boundary" + ], + "latency_ms": 10.842727625, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 386, + "output_tokens": 0 + } + } + 
} + }, + "negative_traps": [ + { + "trap_id": "public-proxy-as-private-proof", + "type": "unsupported_claim", + "evidence_ids": [ + "public-proxy-private-proof-decoy" + ], + "failure_if_used": true + } + ], + "prompt": { + "role": "user", + "content": "What operational evidence did the public-proxy production-private addendum add, and what must it not be called?", + "job_mode": "operate", + "constraints": [ + "cite_evidence", + "record_latency_cost_resource", + "separate_public_proxy_from_private_provider_proof" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "public_proxy_passed", + "text": "The operator-approved public-proxy addendum passed 8/8 query checks with 0 wrong_result." + }, + { + "claim_id": "public_proxy_operational_envelope", + "text": "The public-proxy addendum recorded latency, resource, and token cost-proxy metrics." + }, + { + "claim_id": "public_proxy_boundary", + "text": "The public-proxy addendum is not private-corpus or provider-backed production proof." 
+ } + ], + "must_not_include": [ + "proves real private-corpus production quality", + "provider-backed embedding quality" + ], + "evidence_links": { + "public_proxy_passed": [ + "public-proxy-addendum-pass" + ], + "public_proxy_operational_envelope": [ + "public-proxy-latency-resource-cost" + ], + "public_proxy_boundary": [ + "public-proxy-claim-boundary" + ] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "public-proxy-addendum-pass", + "claim_id": "public_proxy_passed", + "requirement": "cite", + "quote": "passed 8/8 query checks with 0 wrong_result" + }, + { + "evidence_id": "public-proxy-latency-resource-cost", + "claim_id": "public_proxy_operational_envelope", + "requirement": "cite", + "quote": "query mean latency 10.842727625 ms" + }, + { + "evidence_id": "public-proxy-claim-boundary", + "claim_id": "public_proxy_boundary", + "requirement": "cite", + "quote": "not real private-corpus production proof" + } + ], + "scoring_rubric": { + "dimensions": { + "latency_resource": { + "weight": 0.3, + "max_points": 1.0, + "criteria": { + "max_latency_ms": 50.0, + "resource_expectation": "Report public-proxy latency, resource, and token cost-proxy metrics." + } + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites public-proxy pass, operational envelope, and claim-boundary evidence." + }, + "trap_avoidance": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Does not convert public-proxy evidence into private or provider proof." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Explains how to use the proxy evidence safely." 
+ } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [ + "not real private-corpus production proof" + ], + "fallback_action": "state_blocker" + }, + "tags": [ + "generated_public", + "public_proxy", + "production_ops", + "resource_envelope", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/report_snapshots/2026-06-23-p4-production-readiness-evidence-gates-report.json b/apps/elf-eval/fixtures/report_snapshots/2026-06-23-p4-production-readiness-evidence-gates-report.json new file mode 100644 index 00000000..f274f7fe --- /dev/null +++ b/apps/elf-eval/fixtures/report_snapshots/2026-06-23-p4-production-readiness-evidence-gates-report.json @@ -0,0 +1,4597 @@ +{ + "schema": "elf.real_world_job_report/v1", + "run_id": "real-world-memory-p4-production-readiness", + "generated_at": "2026-06-22T21:44:10.104652Z", + "runner_version": "0.2.0-846880fe650fae351131c433cd45773485c5a383-aarch64-apple-darwin", + "corpus_profile": "mixed", + "adapter": { + "adapter_id": "fixture_production_ops", + "name": "ELF P4 production-readiness fixture", + "behavior": "offline_fixture_response", + "storage": "not_encoded", + "runtime": "not_encoded", + "notes": "Offline runner scores checked-in fixture responses; it does not exercise a live external adapter." 
+ }, + "scoreboard": { + "schema": "elf.quality_scoreboard/v1", + "result_states": [ + "pass", + "wrong_result", + "incomplete", + "blocked", + "not_tested", + "not_encoded", + "unsupported_claim" + ], + "evidence_classes": [ + "fixture_backed", + "live_baseline", + "live_real_world", + "research_gate" + ], + "job_typed_non_pass_count": 2, + "job_typed_non_pass_states_present": [ + "blocked" + ], + "job_summary_claim": "typed_non_pass_present", + "external_adapter_typed_non_pass_count": 220, + "external_adapter_typed_non_pass_states_present": [ + "blocked", + "incomplete", + "not_encoded", + "not_tested", + "wrong_result" + ], + "typed_non_pass_count": 222, + "typed_non_pass_states_present": [ + "blocked", + "incomplete", + "not_encoded", + "not_tested", + "wrong_result" + ], + "evidence_class_counts": { + "fixture_backed": 1, + "live_baseline": 6, + "live_real_world": 5, + "research_gate": 11 + }, + "summary_claim": "typed_non_pass_present", + "unqualified_win_claim_allowed": false, + "claim_boundary": "Typed non-pass states and non-live evidence classes must remain visible; reports must not collapse them into unqualified wins." 
+ }, + "operational_evidence": { + "schema": "elf.operational_evidence_gates/v1", + "tiers": [ + { + "tier": "local_fixture", + "status": "pass", + "job_count": 4, + "pass": 4, + "wrong_result": 0, + "lifecycle_fail": 0, + "incomplete": 0, + "blocked": 0, + "not_encoded": 0, + "unsupported_claim": 0, + "mean_latency_ms": 2.05, + "total_cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "resource_evidence_count": 1, + "cold_start_evidence_count": 2, + "restore_evidence_count": 1, + "qdrant_rebuild_evidence_count": 1, + "pass_claim_allowed": true, + "blocker_reasons": [], + "job_ids": [ + "production-ops-restore-cold-start-001", + "production-ops-cold-start-dependency-001", + "production-ops-backfill-resume-001", + "production-ops-resource-envelope-001" + ] + }, + { + "tier": "public_proxy", + "status": "pass", + "job_count": 1, + "pass": 1, + "wrong_result": 0, + "lifecycle_fail": 0, + "incomplete": 0, + "blocked": 0, + "not_encoded": 0, + "unsupported_claim": 0, + "mean_latency_ms": 10.843, + "total_cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 386, + "output_tokens": 0 + }, + "resource_evidence_count": 1, + "cold_start_evidence_count": 0, + "restore_evidence_count": 0, + "qdrant_rebuild_evidence_count": 0, + "pass_claim_allowed": true, + "blocker_reasons": [], + "job_ids": [ + "production-ops-public-proxy-addendum-001" + ] + }, + { + "tier": "private_corpus", + "status": "blocked", + "job_count": 1, + "pass": 0, + "wrong_result": 0, + "lifecycle_fail": 0, + "incomplete": 0, + "blocked": 1, + "not_encoded": 0, + "unsupported_claim": 0, + "mean_latency_ms": 1.6, + "total_cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "resource_evidence_count": 0, + "cold_start_evidence_count": 0, + "restore_evidence_count": 0, + "qdrant_rebuild_evidence_count": 0, + "pass_claim_allowed": false, + "blocker_reasons": [ + "No operator-owned private production corpus manifest 
is checked in or available to this fixture; no private-corpus pass can be claimed." + ], + "job_ids": [ + "production-ops-private-manifest-blocked-001" + ] + }, + { + "tier": "provider_backed", + "status": "blocked", + "job_count": 1, + "pass": 0, + "wrong_result": 0, + "lifecycle_fail": 0, + "incomplete": 0, + "blocked": 1, + "not_encoded": 0, + "unsupported_claim": 0, + "mean_latency_ms": 1.7, + "total_cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "resource_evidence_count": 0, + "cold_start_evidence_count": 0, + "restore_evidence_count": 0, + "qdrant_rebuild_evidence_count": 0, + "pass_claim_allowed": false, + "blocker_reasons": [ + "Provider-backed production operations require operator-owned credentials; checked-in fixtures must not include or require secrets." + ], + "job_ids": [ + "production-ops-credential-boundary-001" + ] + } + ], + "latency": { + "measured_job_count": 7, + "missing_latency_job_count": 0, + "mean_ms": 3.192, + "max_ms": 10.843 + }, + "cost": { + "jobs_with_cost_report": 7, + "missing_cost_job_count": 0, + "zero_cost_job_count": 7, + "total": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 386, + "output_tokens": 0 + }, + "claim_boundary": "Fixture and local-provider zero-cost reports are execution-accounting evidence only; they do not prove hosted provider spend." 
+ }, + "resource": { + "resource_envelope_job_count": 2, + "resource_envelope_pass_count": 2, + "latency_resource_dimension_job_count": 2, + "job_ids": [ + "production-ops-public-proxy-addendum-001", + "production-ops-resource-envelope-001" + ] + }, + "cold_start_restore_rebuild": { + "cold_start_job_count": 2, + "cold_start_pass_count": 2, + "restore_job_count": 1, + "restore_pass_count": 1, + "qdrant_rebuild_job_count": 1, + "qdrant_rebuild_pass_count": 1, + "job_ids": [ + "production-ops-cold-start-dependency-001", + "production-ops-restore-cold-start-001" + ] + }, + "missing_private_provider_inputs_are_typed_blockers": true, + "private_corpus_pass_claim_allowed": false, + "provider_backed_pass_claim_allowed": false, + "claim_boundary": "Operational evidence tiers are separate: local fixture and public-proxy passes do not prove private-corpus or provider-backed production quality." + }, + "external_adapters": { + "schema": "elf.real_world_external_adapter_report/v1", + "manifest_id": "real-world-memory-project-adapters-2026-06-11-first-generation-continuity-source-store", + "docker_isolation": { + "default": true, + "compose_file": "docker-compose.baseline.yml", + "runner": "scripts/live-baseline-benchmark.sh", + "artifact_dir": "tmp/live-baseline/", + "host_global_installs_required": false, + "notes": [ + "External project runs default to Docker Compose and Docker-managed caches.", + "Real-world job fixture reports and live baseline reports use separate schemas and claim boundaries." 
+ ] + }, + "summary": { + "adapter_count": 23, + "external_project_count": 16, + "docker_default_count": 23, + "host_global_install_required_count": 0, + "fixture_backed_count": 1, + "live_baseline_only_count": 6, + "live_real_world_count": 5, + "research_gate_count": 11, + "overall_status_counts": { + "real": 0, + "mocked": 0, + "unsupported": 0, + "blocked": 7, + "incomplete": 0, + "wrong_result": 6, + "lifecycle_fail": 1, + "pass": 4, + "not_encoded": 5 + }, + "capability_status_counts": { + "real": 8, + "mocked": 1, + "unsupported": 6, + "blocked": 23, + "incomplete": 0, + "wrong_result": 10, + "lifecycle_fail": 0, + "pass": 30, + "not_encoded": 26 + }, + "suite_status_counts": { + "real": 0, + "mocked": 0, + "unsupported": 0, + "blocked": 24, + "incomplete": 0, + "wrong_result": 7, + "lifecycle_fail": 0, + "pass": 27, + "not_encoded": 37 + }, + "scenario_status_counts": { + "real": 0, + "mocked": 0, + "unsupported": 3, + "blocked": 21, + "incomplete": 5, + "wrong_result": 6, + "lifecycle_fail": 1, + "pass": 23, + "not_encoded": 13 + }, + "scenario_position_counts": { + "wins": 10, + "ties": 11, + "loses": 1, + "untested": 50 + }, + "scenario_outcome_counts": { + "win": 10, + "tie": 11, + "loss": 1, + "not_tested": 19, + "blocked": 26, + "non_goal": 5 + } + }, + "adapters": [ + { + "adapter_id": "elf_real_world_memory_fixture", + "project": "ELF", + "adapter_kind": "offline_fixture_response", + "evidence_class": "fixture_backed", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "pass", + "evidence": "The checked-in real_world_memory fixtures parse and score through the ELF fixture runner.", + "command": "cargo make real-world-memory", + "artifact": "tmp/real-world-memory/real-world-memory-report.json" + }, + "run": { + "status": "blocked", + "evidence": "The current fixture set reports 60 jobs across 16 suites: 53 pass, 0 incomplete, 7 blocked, 0 wrong_result, 0 not_encoded, and 0 
unsupported_claim. The six core_archival_memory jobs pass as ELF fixture evidence, not as live Letta comparison evidence; the one memory_summary job passes as fixture-backed source-trace evidence, not as managed-memory parity evidence; the proactive_brief suite scores 4 passing evidence-linked suggestions plus one blocked private-corpus refresh case tied to XY-930, not Pulse or hosted managed-memory parity; the scheduled_memory suite scores 4 passing scheduled readback tasks plus one blocked private/provider scheduler case tied to XY-930, not hosted scheduler, ChatGPT Tasks, Pulse, or provider-backed private-corpus parity; context_trajectory remains blocked behind OpenViking staged-artifact materialization.", + "command": "cargo make real-world-memory", + "artifact": "tmp/real-world-memory/real-world-memory-report.json" + }, + "result": { + "status": "blocked", + "evidence": "This is fixture-backed ELF scoring, not a live external adapter result.", + "artifact": "tmp/real-world-memory/real-world-memory-report.md" + }, + "capabilities": [ + { + "capability": "real_world_job_fixture_scoring", + "status": "real", + "evidence": "The runner scores checked-in real_world_job records with expected evidence, traps, and typed status output." + }, + { + "capability": "live_external_adapter_execution", + "status": "not_encoded", + "evidence": "The ELF fixture response path does not exercise an external memory project runtime." + }, + { + "capability": "docker_isolated_baseline", + "status": "pass", + "evidence": "ELF live baseline runs execute through docker-compose.baseline.yml for retrieval and lifecycle evidence." + } + ], + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "pass", + "evidence": "Checked-in source-of-truth rebuild fixture is encoded and passing." + }, + { + "suite_id": "work_resume", + "status": "pass", + "evidence": "Checked-in work-resume fixtures are encoded and passing." 
+ }, + { + "suite_id": "project_decisions", + "status": "pass", + "evidence": "Checked-in project-decision fixtures cover accepted decisions, reversals, current validation gates, rationale, and bounded caveats." + }, + { + "suite_id": "retrieval", + "status": "pass", + "evidence": "Checked-in retrieval fixtures cover alternate phrasing, distractors, multi-hop routing, current-versus-obsolete selection, and minimal context." + }, + { + "suite_id": "memory_evolution", + "status": "pass", + "evidence": "Checked-in memory-evolution fixtures cover current-versus-historical facts and the relation temporal-validity case is encoded." + }, + { + "suite_id": "consolidation", + "status": "pass", + "evidence": "Proposal-only consolidation fixtures are encoded and passing without source mutation." + }, + { + "suite_id": "memory_summary", + "status": "pass", + "evidence": "The source-trace memory summary fixture is encoded and passing with freshness, rationale, tombstone, and unsupported-claim guards." + }, + { + "suite_id": "proactive_brief", + "status": "blocked", + "evidence": "The proactive brief suite scores 4 passing source-linked suggestions and 1 typed private-corpus refresh blocker tied to XY-930." + }, + { + "suite_id": "scheduled_memory", + "status": "blocked", + "evidence": "The scheduled memory suite scores 4 passing source-linked task readbacks with execution trace coverage and 1 typed private/provider scheduler blocker tied to XY-930." + }, + { + "suite_id": "knowledge_compilation", + "status": "pass", + "evidence": "Knowledge page fixtures are encoded and passing with citation and rebuild metrics." + }, + { + "suite_id": "operator_debugging_ux", + "status": "pass", + "evidence": "Operator-debugging fixtures now expose stage attribution and dropped-candidate evidence without raw SQL." 
+ }, + { + "suite_id": "capture_integration", + "status": "pass", + "evidence": "Four redaction, exclusion, source-id, evidence-binding, and capture-boundary fixtures are encoded and passing." + }, + { + "suite_id": "core_archival_memory", + "status": "pass", + "evidence": "Six fixture jobs score core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search." + }, + { + "suite_id": "production_ops", + "status": "blocked", + "evidence": "Production-ops fixtures encode restore, Qdrant rebuild, backfill resume, resource-envelope interpretation, OpenViking wrong-result classification, plus typed blocked operator boundaries." + }, + { + "suite_id": "personalization", + "status": "pass", + "evidence": "The scoped preference fixture is encoded and passing." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "OpenViking staged retrieval, hierarchy selection, and recursive/context expansion fixtures are encoded as blocked until same-corpus evidence ids and staged artifacts are materialized." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_memory/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-memory", + "status": "pass" + } + ], + "notes": [ + "This adapter record exists to keep ELF fixture results separate from live external adapter results.", + "The remaining non-pass ELF fixture states are production-ops operator boundaries plus OpenViking context-trajectory measurement gates.", + "Use elf_live_real_world for service-runtime real_world_job evidence; this fixture-backed record must not imply live-service behavior." 
+ ] + }, + { + "adapter_id": "elf_live_real_world", + "project": "ELF", + "adapter_kind": "docker_service_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "The live adapter task runs inside docker-compose.baseline.yml with Docker-owned Postgres, Qdrant, Cargo, npm, qmd, and cache volumes.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + "run": { + "status": "wrong_result", + "evidence": "ELF materializes 55 real_world_job adapter_response objects through ElfService, worker indexing, search_raw, live capture/write-policy ingestion, live consolidation proposal review, live knowledge-page rebuild/lint, and operator-debug trace metadata before scoring; the full sweep includes typed wrong_result, blocked, and not_encoded job records.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The fresh full live sweep scores 55 jobs across all 13 checked-in suites, including live-scored consolidation, knowledge-page, capture/write-policy, and operator-debug suites. This is not a full-suite live pass because memory-evolution, production-ops, core-archival, and context-trajectory gaps remain typed non-pass records.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-report.md" + }, + "capabilities": [ + { + "capability": "real_world_job_adapter", + "status": "pass", + "evidence": "The adapter executes real_world_job prompts after runtime ingestion and writes generated answer artifacts before scoring." 
+ }, + { + "capability": "service_runtime_execution", + "status": "real", + "evidence": "The materializer uses ElfService, Postgres, Qdrant, deterministic providers, worker indexing, and search_raw in Docker." + }, + { + "capability": "targeted_live_pass", + "status": "pass", + "evidence": "The answer-retrieval suites from the original representative slice still pass: work_resume, retrieval, and project_decisions." + }, + { + "capability": "full_suite_live_sweep", + "status": "wrong_result", + "evidence": "The runner now emits per-job and per-suite live records for all 55 checked-in jobs, including the operator-debug fixture tree, but memory_evolution is wrong_result and production/core/context boundaries remain typed non-pass." + }, + { + "capability": "full_suite_live_pass", + "status": "wrong_result", + "evidence": "No full-suite live pass is claimed; generated reports preserve wrong_result, blocked, and not_encoded job outcomes." + }, + { + "capability": "typed_failure_reporting", + "status": "pass", + "evidence": "Adapter setup/runtime limitations are materialized as typed jobs with evidence JSON instead of silent claim upgrades." + } + ], + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "pass", + "evidence": "The live adapter retrieved the restore/Qdrant rebuild proof evidence through the service runtime." + }, + { + "suite_id": "work_resume", + "status": "pass", + "evidence": "The live adapter passed 5/5 work_resume jobs through service-runtime evidence retrieval." + }, + { + "suite_id": "retrieval", + "status": "pass", + "evidence": "The live adapter passed 5/5 retrieval jobs through service-runtime evidence retrieval." + }, + { + "suite_id": "project_decisions", + "status": "pass", + "evidence": "The live adapter passed 5/5 project_decisions jobs through service-runtime evidence retrieval." 
+ }, + { + "suite_id": "memory_evolution", + "status": "wrong_result", + "evidence": "The live adapter passed the delete/TTL case but failed five current-versus-historical conflict jobs because retrieval-backed answers did not provide the required historical conflict evidence links." + }, + { + "suite_id": "consolidation", + "status": "pass", + "evidence": "The live adapter creates consolidation runs, materializes proposal jobs through the worker, preserves source lineage and unsupported-claim flags, and applies/defers/discards proposals through review audit transitions." + }, + { + "suite_id": "knowledge_compilation", + "status": "pass", + "evidence": "The live adapter rebuilds derived knowledge pages through ElfService, searches page sections, lints stale source refs after runtime source updates, and emits citation/backlink/unsupported-section page artifacts." + }, + { + "suite_id": "operator_debugging_ux", + "status": "pass", + "evidence": "The full live sweep includes operator_debugging_ux fixtures and emits trace ids, viewer/admin trace-bundle links, replay commands, dropped-candidate visibility, repair-action clarity, and raw_sql_needed=false." + }, + { + "suite_id": "capture_integration", + "status": "pass", + "evidence": "The live adapter passes 4/4 capture_integration jobs through Docker-local ELF ingestion, including capture-boundary classification, excluded evidence ids, source ids in source_ref, write_policy redaction audit counts, evidence binding, and zero secret leakage." + }, + { + "suite_id": "production_ops", + "status": "blocked", + "evidence": "The live adapter sweep does not run backup/restore, private corpus, provider credential, or backfill operations; existing production-ops credential and private-manifest boundaries remain blocked." + }, + { + "suite_id": "personalization", + "status": "pass", + "evidence": "The live adapter retrieved the scoped preference evidence and passed the personalization job."
+ }, + { + "suite_id": "core_archival_memory", + "status": "not_encoded", + "evidence": "The full live adapter sweep preserves the core/archival fixture gap as typed not_encoded; this issue does not add live core-block attachment/readback materialization." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "The OpenViking-style context trajectory fixtures remain blocked by live staged-trajectory and recursive-expansion measurement gaps." + } + ], + "scenarios": [ + { + "scenario_id": "live_capture_write_policy", + "suite_id": "capture_integration", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF live capture/write-policy jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage. This is an ELF self-check, not a win over external hook systems.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + { + "scenario_id": "live_consolidation_proposal_review", + "suite_id": "consolidation", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF live consolidation jobs now exercise source lineage, unsupported-claim flags, and apply/defer/discard review audit transitions. This is an ELF service self-check, not a broad competitor win.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + { + "scenario_id": "live_knowledge_page_rebuild_lint", + "suite_id": "knowledge_compilation", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF live knowledge jobs now exercise page rebuild, search, stale-source lint, citations, backlinks, and unsupported-section handling. 
This is an ELF service self-check, not a broad knowledge-product win.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + { + "scenario_id": "full_sweep_operator_debug", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "ELF full live sweep now includes the operator-debug fixture tree with hydrated trace ids, trace-bundle replay commands, dropped-candidate visibility, repair guidance, and no raw SQL requirement.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + } + ], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_memory/", + "status": "real" + }, + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-memory-live-adapters", + "status": "pass" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/live-adapters/elf-report.json", + "status": "pass" + } + ], + "notes": [ + "This Docker-isolated live real_world_job record now covers the full encoded fixture corpus, not only the original three-suite representative slice.", + "The record is a full-suite sweep, not a full-suite pass; wrong_result, blocked, and not_encoded states remain visible.", + "This record does not prove private-corpus production quality or provider-backed production operations." 
+ ] + }, + { + "adapter_id": "qmd_live_baseline", + "project": "qmd", + "adapter_kind": "docker_cli_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "pass", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner installs qmd inside the baseline container.", + "command": "ELF_BASELINE_PROJECTS=qmd cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/qmd.log" + }, + "run": { + "status": "pass", + "evidence": "qmd same-corpus retrieval, update, delete, and cold-start checks are encoded in the live baseline runner.", + "command": "ELF_BASELINE_PROJECTS=qmd cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "pass", + "evidence": "This live_baseline_only record is same-corpus evidence only; cite qmd_live_real_world for the full live real-world sweep.", + "artifact": "docs/runbook/benchmarking/live_baseline_benchmark.md" + }, + "capabilities": [ + { + "capability": "same_corpus_retrieval", + "status": "pass", + "evidence": "qmd has an encoded Docker same-corpus retrieval adapter." + }, + { + "capability": "update_delete_cold_start", + "status": "pass", + "evidence": "qmd lifecycle smoke checks are encoded in the live-baseline runner." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "This live_baseline_only record does not execute real_world_job prompts; cite qmd_live_real_world for the full live real-world sweep." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "This live_baseline_only record does not execute real_world_job retrieval prompts; cite qmd_live_real_world for the live retrieval adapter run." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "Live-baseline lifecycle checks exist, but no real_world_job memory_evolution run is encoded." 
+ }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "qmd debug ergonomics are a reference dimension; no operator_debugging_ux fixture is executed against qmd." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + }, + { + "kind": "compose", + "ref": "docker-compose.baseline.yml", + "status": "real" + } + ], + "notes": [ + "This same-corpus record remains separate from qmd_live_real_world, which records real_world_job prompt execution and scoring evidence." + ] + }, + { + "adapter_id": "qmd_live_real_world", + "project": "qmd", + "adapter_kind": "docker_cli_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "The live adapter task clones and installs qmd inside the baseline Docker container when the checkout is absent.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/qmd-materialization.json" + }, + "run": { + "status": "wrong_result", + "evidence": "qmd materializes 55 real_world_job adapter_response objects through collection add, update, embed, and query --json before scoring; the full sweep includes typed wrong_result, blocked, and not_encoded job records, with operator-debug fixtures scored through qmd replay metadata rather than ELF trace hydration.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/qmd-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The fresh full qmd live sweep scores 55 jobs across all 13 checked-in suites, preserving consolidation, knowledge-page, capture, production-ops, core-archival, and context-trajectory gaps as typed non-pass records. 
This is not a full-suite live pass.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/qmd-report.md" + }, + "capabilities": [ + { + "capability": "real_world_job_adapter", + "status": "pass", + "evidence": "qmd executes real_world_job prompts through its local CLI retrieval/query workflow and records generated answer artifacts." + }, + { + "capability": "local_cli_retrieval", + "status": "real", + "evidence": "The adapter uses qmd collection add, update, embed -f, and query --json inside Docker." + }, + { + "capability": "targeted_live_pass", + "status": "pass", + "evidence": "The answer-retrieval suites from the original representative slice still pass: work_resume, retrieval, and project_decisions." + }, + { + "capability": "full_suite_live_sweep", + "status": "wrong_result", + "evidence": "The runner now emits per-job and per-suite live records for all 55 checked-in jobs, including the operator-debug fixture tree, but memory_evolution and operator_debugging_ux are wrong_result while non-qmd product surfaces remain typed not_encoded or blocked." + }, + { + "capability": "full_suite_live_pass", + "status": "wrong_result", + "evidence": "No full-suite live pass is claimed; generated reports preserve wrong_result, blocked, and not_encoded job outcomes." + }, + { + "capability": "typed_failure_reporting", + "status": "pass", + "evidence": "qmd setup/runtime limitations are materialized as typed jobs with command evidence and retry artifacts." + } + ], + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "pass", + "evidence": "qmd retrieved the restore/Qdrant rebuild proof evidence through the local CLI workflow." + }, + { + "suite_id": "work_resume", + "status": "pass", + "evidence": "qmd passed 5/5 work_resume jobs through CLI evidence retrieval." + }, + { + "suite_id": "retrieval", + "status": "pass", + "evidence": "qmd passed 5/5 retrieval jobs through CLI evidence retrieval." 
+ }, + { + "suite_id": "project_decisions", + "status": "pass", + "evidence": "qmd passed 5/5 project_decisions jobs through CLI evidence retrieval." + }, + { + "suite_id": "memory_evolution", + "status": "wrong_result", + "evidence": "qmd failed all six memory-evolution jobs in the fresh June 11 diagnostic, including the delete/TTL tombstone job where qmd retrieved only the current plan and missed the tombstone evidence." + }, + { + "suite_id": "consolidation", + "status": "not_encoded", + "evidence": "The qmd live adapter sweep retrieves evidence-linked answers but does not generate or review consolidation proposals." + }, + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "The qmd live adapter sweep retrieves evidence-linked answers but does not generate derived knowledge pages." + }, + { + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "evidence": "The full qmd live sweep includes operator_debugging_ux fixtures and records replay-command metadata, but it lacks ELF trace hydration, viewer links, and intermediate candidate-drop stages, so the suite remains wrong_result." + }, + { + "suite_id": "capture_integration", + "status": "not_encoded", + "evidence": "The qmd live adapter sweep does not exercise capture integrations or write-policy redaction boundaries; all capture_integration jobs remain typed not_encoded for qmd." + }, + { + "suite_id": "production_ops", + "status": "blocked", + "evidence": "The qmd live adapter sweep does not run backup/restore, private corpus, provider credential, or backfill operations; existing production-ops credential and private-manifest boundaries remain blocked." + }, + { + "suite_id": "personalization", + "status": "pass", + "evidence": "qmd retrieved the scoped preference evidence and passed the personalization job." 
+ }, + { + "suite_id": "core_archival_memory", + "status": "not_encoded", + "evidence": "The qmd live adapter sweep preserves the core/archival fixture gap as typed not_encoded; qmd does not expose ELF core-block attachment/readback materialization." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "The OpenViking-style context trajectory fixtures remain blocked by live staged-trajectory and recursive-expansion measurement gaps." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_memory/", + "status": "real" + }, + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-memory-live-adapters", + "status": "pass" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/live-adapters/qmd-report.json", + "status": "pass" + } + ], + "notes": [ + "This qmd record is real-world job evidence and must not be conflated with the same-corpus qmd_live_baseline record.", + "The record is a full-suite sweep, not a full-suite pass; wrong_result, blocked, and not_encoded states remain visible.", + "This record does not prove broad RAG/graph adapter parity or private-corpus production quality." 
+ ] + }, + { + "adapter_id": "elf_operator_debug_live", + "project": "ELF", + "adapter_kind": "docker_service_operator_debug_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "pass", + "setup": { + "status": "pass", + "evidence": "The narrow operator-debug live task runs inside docker-compose.baseline.yml with Docker-owned Postgres, Qdrant, Cargo, npm, qmd, and cache volumes.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-materialization.json" + }, + "run": { + "status": "pass", + "evidence": "ELF materializes operator_debugging_ux adapter_response objects through ElfService, worker indexing, search_raw trace ids, and generated operator_debug metadata.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-report.json" + }, + "result": { + "status": "pass", + "evidence": "The narrow live slice scores operator-debugging jobs with trace availability, replay command availability, candidate-drop visibility, repair-action clarity, and raw-SQL avoidance separated in job-level evidence.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-report.md" + }, + "capabilities": [ + { + "capability": "operator_debug_real_world_job_adapter", + "status": "pass", + "evidence": "The adapter executes the checked-in operator_debugging_ux jobs through the live service materializer and generated scoring fixtures." + }, + { + "capability": "trace_hydration_metadata", + "status": "pass", + "evidence": "Generated operator_debug records include service trace ids, viewer links, admin trace-bundle URLs, and trace_available=true." 
+ }, + { + "capability": "replay_command_metadata", + "status": "pass", + "evidence": "Generated operator_debug records include admin trace-bundle curl replay commands; no raw SQL path is required." + }, + { + "capability": "candidate_drop_visibility", + "status": "pass", + "evidence": "The operator-debug jobs keep dropped-candidate visibility as explicit job-level evidence instead of relying on direct database inspection." + }, + { + "capability": "openmemory_or_claude_mem_ui_runner", + "status": "not_encoded", + "evidence": "This ELF live slice does not launch OpenMemory or claude-mem UI flows." + } + ], + "suites": [ + { + "suite_id": "operator_debugging_ux", + "status": "pass", + "evidence": "The narrow live operator-debug slice scores trace hydration, stage attribution, candidate-drop visibility, selected-but-not-narrated diagnosis, and repair-action clarity through generated ELF live artifacts." + } + ], + "scenarios": [ + { + "scenario_id": "operator_debug_trace_hydration", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "ELF generated trace_available=true, service trace ids, viewer URLs, and admin trace-bundle replay URLs for the operator-debug jobs; qmd has replay rows but no ELF trace hydration surface.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-report.json" + }, + { + "scenario_id": "operator_debug_replay_command", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF generated admin trace-bundle replay commands; qmd generated local CLI query replay commands. 
These are comparable replay-command availability artifacts, not equivalent UI quality claims.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/summary.json" + }, + { + "scenario_id": "operator_debug_candidate_drop_visibility", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "ELF generated operator_debug candidate-drop visibility from trace and replay-candidate metadata without direct SQL assumptions; qmd keeps only top-k replay rows and lacks intermediate candidate-drop stages.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-materialization.json" + }, + { + "scenario_id": "operator_debug_repair_action_clarity", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF and qmd generated clear repair/replay steps for the narrow operator-debug jobs; OpenMemory UI/export remains blocked, and claude-mem UI repair paths remain blocked until Docker-contained hook/viewer evidence exists.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/summary.json" + }, + { + "scenario_id": "operator_debug_selected_but_not_narrated", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "The new selected-but-not-narrated job scores whether selected trace evidence is available for answer-composition repair without direct database inspection.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-report.json" + } + ], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/", + 
"status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-job-operator-ux-live-adapters", + "status": "pass" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-job/operator-ux-live-adapters/elf-report.json", + "status": "pass" + } + ], + "notes": [ + "This is a narrow operator-debug live slice, not a full-suite live pass.", + "The record does not implement product UI improvements and does not claim broad qmd/OpenMemory/claude-mem superiority." + ] + }, + { + "adapter_id": "qmd_operator_debug_live", + "project": "qmd", + "adapter_kind": "docker_cli_operator_debug_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "The narrow operator-debug live task clones and installs qmd inside the baseline Docker container when the checkout is absent.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-materialization.json" + }, + "run": { + "status": "wrong_result", + "evidence": "qmd materializes operator_debugging_ux adapter_response objects through collection add, update, embed, and query --json, then records local replay-command metadata but no service trace hydration.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The narrow live slice gives qmd explicit replay-command evidence, but operator-debug jobs remain wrong_result where trace availability, trace completeness, or candidate-drop stage visibility is required.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.md" + }, + "capabilities": [ + { + "capability": "operator_debug_real_world_job_adapter", + "status": 
"pass", + "evidence": "The adapter executes the checked-in operator_debugging_ux jobs through qmd local CLI materialization and generated scoring fixtures." + }, + { + "capability": "local_replay_command_metadata", + "status": "pass", + "evidence": "Generated operator_debug records include qmd query replay commands tied to per-job collections." + }, + { + "capability": "trace_hydration_metadata", + "status": "wrong_result", + "evidence": "Generated qmd operator_debug records have trace_available=false and no ELF viewer/admin trace bundle because qmd exposes local replay rows rather than service trace hydration." + }, + { + "capability": "candidate_drop_visibility", + "status": "wrong_result", + "evidence": "qmd top-k replay output is available, but intermediate candidate-drop stages are not exposed in the generated artifact." + }, + { + "capability": "openmemory_or_claude_mem_ui_runner", + "status": "not_encoded", + "evidence": "This qmd live slice does not launch OpenMemory or claude-mem UI flows." + } + ], + "suites": [ + { + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "evidence": "The narrow qmd operator-debug slice scores local replay commands but remains wrong_result for trace hydration and candidate-drop stage visibility." 
+ } + ], + "scenarios": [ + { + "scenario_id": "operator_debug_trace_hydration", + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "qmd generated replay-command metadata but trace_available=false, so ELF wins only this trace-hydration dimension; this is not a broad qmd loss.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + }, + { + "scenario_id": "operator_debug_replay_command", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "qmd generated local CLI query replay commands for the same operator-debugging scenarios; ELF generated admin trace-bundle curl commands.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/summary.json" + }, + { + "scenario_id": "operator_debug_candidate_drop_visibility", + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "qmd generated top-k replay output but not intermediate retrieved-but-dropped stage visibility, so candidate-drop diagnosis remains a qmd wrong_result in this narrow slice.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-materialization.json" + }, + { + "scenario_id": "operator_debug_repair_action_clarity", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "qmd generated clear local replay steps for repair investigation, matching ELF on repair-action clarity while differing on trace hydration.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": 
"tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + }, + { + "scenario_id": "operator_debug_selected_but_not_narrated", + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "qmd can replay top-k rows, but the generated artifact does not expose service trace narration stages for the selected-but-not-narrated diagnosis.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + } + ], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-job-operator-ux-live-adapters", + "status": "wrong_result" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json", + "status": "wrong_result" + } + ], + "notes": [ + "This is a narrow operator-debug live slice, not a full-suite live pass.", + "qmd's replay-command availability remains useful; the wrong_result status is limited to trace hydration and candidate-drop stage visibility." 
+ ] + }, + { + "adapter_id": "agentmemory_live_baseline", + "project": "agentmemory", + "adapter_kind": "docker_sdk_mock_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "lifecycle_fail", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner installs and exercises agentmemory package APIs.", + "command": "ELF_BASELINE_PROJECTS=agentmemory cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/agentmemory.log" + }, + "run": { + "status": "lifecycle_fail", + "evidence": "Same-corpus retrieval can run, but durable lifecycle behavior is not proven because the adapter uses an in-memory SDK/KV mock.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "lifecycle_fail", + "evidence": "agentmemory remains a reference for capture and continuity UX, but current Docker evidence is not a durable lifecycle pass.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "capabilities": [ + { + "capability": "same_corpus_retrieval", + "status": "pass", + "evidence": "The current adapter can run mem::remember and mem::search against the shared corpus." + }, + { + "capability": "adapter_storage", + "status": "mocked", + "evidence": "The current adapter uses a process-local StateKV Map and in-memory index." + }, + { + "capability": "durable_cold_start", + "status": "blocked", + "evidence": "A persistent upstream KV/index path or hosted runtime is needed before cold-start recovery can be fairly scored." + }, + { + "capability": "durable_work_resume_capture_path", + "status": "blocked", + "evidence": "XY-925 selects the next local path as a Docker-contained agentmemory session directory with persisted SDK KV store, observation log, and searchable index across a fresh process; the current StateKV Map and in-memory index still block scoring." 
+ }, + { + "capability": "write_policy_hook_capture", + "status": "blocked", + "evidence": "Capture/write-policy jobs require live agentmemory hook observations plus persisted write-policy audit evidence. The current adapter does not execute those hooks." + }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "XY-925 adds fixture-backed blocked prompt coverage for the required durable path, but no live agentmemory real_world_job adapter executes prompts until the persistent local store exists." + } + ], + "suites": [ + { + "suite_id": "work_resume", + "status": "blocked", + "evidence": "A durable upstream agentmemory session/capture path is required before work-resume jobs can be compared fairly." + }, + { + "suite_id": "capture_integration", + "status": "blocked", + "evidence": "The current fixture import boundary is offline and does not run live agentmemory hooks." + }, + { + "suite_id": "memory_evolution", + "status": "blocked", + "evidence": "Durable update/supersede/delete history is not proven by the in-memory adapter." + } + ], + "scenarios": [ + { + "scenario_id": "basic_same_corpus_retrieval", + "suite_id": "retrieval", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports agentmemory retrieval_pass with 3/3 same-corpus retrieval checks through mem::remember and mem::search. 
This is live-baseline-only evidence through an in-memory mock, not a real_world_job suite pass.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "durable_update_reload_lifecycle", + "suite_id": "memory_evolution", + "status": "lifecycle_fail", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports ELF passing 8/8 local lifecycle checks, while agentmemory update_replaces_note_text is lifecycle_fail and cold_start_recovery_search is blocked because the harness uses an in-memory SDK/KV mock. This is an ELF baseline win only at the local lifecycle-smoke evidence class.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "work_resume_capture_continuity", + "suite_id": "work_resume", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "agentmemory's relevant strength is durable coding-agent continuity and capture, but the Docker harness has not proven a persistent session/capture path. 
XY-925 selects the durable local path as a Docker-contained session directory that persists the SDK KV store and searchable index across a fresh process; keep work_resume and capture claims blocked until that path exists.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "tmp/real-world-memory/first-generation-oss/report.json" + }, + { + "scenario_id": "durable_work_resume_local_path", + "suite_id": "work_resume", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "The selected comparable path is explicit: capture into a Docker-local agentmemory session directory, persist the SDK KV/index and observation log, restart a fresh process, then score work_resume prompts. The checked-in fixture records this as blocked rather than scoring the current mock.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json" + }, + { + "scenario_id": "capture_write_policy_hooks", + "suite_id": "capture_integration", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "agentmemory capture/write-policy comparison needs live hook observations and write-policy audit evidence persisted through the selected local store. 
The fixture preserves this as a typed blocker and does not convert the mem::remember smoke into capture proof.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json" + } + ], + "evidence": [ + { + "kind": "evidence", + "ref": "docs/evidence/external_memory/agentmemory_adapter.md", + "status": "real" + }, + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "mocked" + } + ], + "notes": [ + "The offline agentmemory fixture adapter is an import/comparison boundary and must not be treated as live benchmark proof." + ], + "follow_up": { + "title": "[ELF benchmark P0] Make agentmemory adapter lifecycle-durable and fail-typed", + "reason": "A durable upstream agentmemory storage path is required before lifecycle and real-world job suites can be fairly scored." + } + }, + { + "adapter_id": "mem0_openmemory_live_baseline", + "project": "mem0/OpenMemory", + "adapter_kind": "docker_sdk_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "pass", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner can install mem0 and configure local FastEmbed/Qdrant paths.", + "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/mem0.log" + }, + "run": { + "status": "pass", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 exercises local OSS mem0 with FastEmbed, Qdrant path storage, Memory.update, Memory.delete, Memory.history, Memory.get_all, entity filters, and cold-start reload; mem0 passed 8/8 encoded SDK checks. 
XY-931 adds a separate OpenMemory export-helper setup probe artifact and keeps that blocked UI/export result out of the SDK check summary.", + "command": "cargo make openmemory-ui-export-readback", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "pass", + "evidence": "The local OSS mem0 baseline now passes same-corpus retrieval, update/delete/reload, preference correction history, entity-scoped personalization, local get_all export-style readback, and deletion audit history. The separate OpenMemory export-helper setup probe is blocked because Docker is unavailable inside the baseline-runner container before any product app database readback can run. It still does not claim hosted Platform export, optional graph memory, or a real_world_job prompt adapter.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "capabilities": [ + { + "capability": "local_storage", + "status": "real", + "evidence": "The adapter targets local FastEmbed, Qdrant path storage, and local history DB paths in Docker." + }, + { + "capability": "same_corpus_retrieval", + "status": "pass", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 retrieval_pass with 3/3 same-corpus retrieval checks." + }, + { + "capability": "local_lifecycle_update_delete_reload", + "status": "pass", + "evidence": "The Docker runner exercises public Memory.update, Memory.delete, and a new Memory.from_config over the same local Qdrant/history paths; the fresh scoped run reports those lifecycle checks passing." + }, + { + "capability": "preference_correction_history", + "status": "pass", + "evidence": "The fresh scoped run reports preference_correction_history as pass: Memory.history preserved explicit ADD and UPDATE records with old and current preference text, and search returned only the current correction." 
+ }, + { + "capability": "entity_scoped_personalization", + "status": "pass", + "evidence": "The fresh scoped run reports entity_scoped_personalization as pass: user_id, agent_id, and run_id filters returned the ELF scoped preference and omitted a PubFi scoped preference." + }, + { + "capability": "local_get_all_export_readback", + "status": "pass", + "evidence": "The fresh scoped run reports local_get_all_export_readback as pass: Memory.get_all returned the current scoped preference and omitted the other scope." + }, + { + "capability": "deletion_audit_history", + "status": "pass", + "evidence": "The fresh scoped run reports delete_history_audit_readback as pass: Memory.history exposed a DELETE event and search suppressed the deleted memory." + }, + { + "capability": "openmemory_ui_readback", + "status": "blocked", + "evidence": "XY-931 runs a bounded OpenMemory export-helper setup probe after the mem0 SDK corpus checks. The probe finds the OpenMemory tree, UI package, compose file, and export helper, then records a setup blocker because the export helper requires Docker access to a running OpenMemory container. Local SDK get_all readback is measured separately and must not be reused as UI evidence." + }, + { + "capability": "hosted_managed_memory_claims", + "status": "unsupported", + "evidence": "Hosted mem0 Platform behavior and Platform UI export are outside the local OSS Docker adapter and are non-goals for this local evidence record." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No mem0/OpenMemory adapter currently executes real_world_job prompts and answer scoring." + }, + { + "capability": "optional_graph_memory", + "status": "not_encoded", + "evidence": "Optional graph memory is not enabled in the default local OSS path and remains an opt-in scenario gate rather than a default pass/fail claim." 
+ } + ], + "suites": [ + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "Scenario-level local OSS checks now measure preference correction history and deletion audit readback, but no mem0 real_world_job memory_evolution prompt adapter is encoded." + }, + { + "suite_id": "personalization", + "status": "not_encoded", + "evidence": "Scenario-level local OSS checks now measure entity-scoped personalization, but no mem0 real_world_job personalization prompt adapter is encoded." + }, + { + "suite_id": "operator_debugging_ux", + "status": "blocked", + "evidence": "Local SDK get_all inspection is measured, but OpenMemory UI/export readback is blocked by the XY-931 export-helper setup probe until a dedicated OpenMemory compose/import path can load the same corpus into the OpenMemory app database." + } + ], + "scenarios": [ + { + "scenario_id": "basic_local_lifecycle", + "suite_id": "memory_evolution", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "Prior comparable baseline run live-baseline-20260611061612 reports ELF passing 8/8 local lifecycle checks and mem0 passing basic same-corpus retrieval, update, delete, and cold-start reload checks. This remains a basic local lifecycle tie at the encoded smoke surface and is not reused as history/UI evidence.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "preference_correction_history", + "suite_id": "personalization", + "status": "pass", + "elf_position": "loses", + "comparison_outcome": "loss", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 preference_correction_history as pass. 
ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF live memory-evolution preference as wrong_result. The current measured comparison is therefore an ELF loss on this history dimension until ELF temporal reconciliation is fixed.", + "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", + "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" + }, + { + "scenario_id": "entity_scoped_personalization", + "suite_id": "personalization", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 entity_scoped_personalization as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md, which records ELF and qmd passing the encoded personalization slice. This is a measured tie on the current scoped-preference surface.", + "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", + "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md" + }, + { + "scenario_id": "delete_audit_readback", + "suite_id": "memory_evolution", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 delete_history_audit_readback as pass. 
ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF passing the delete/TTL tombstone job. The current measured delete-audit comparison is a tie.", + "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", + "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" + }, + { + "scenario_id": "local_get_all_export_readback", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 local_get_all_export_readback as pass. This is local SDK inspection/export-style readback, not OpenMemory UI evidence; ELF has no directly comparable live UI/export scoring row in this run.", + "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/mem0-checks.json" + }, + { + "scenario_id": "openmemory_ui_export_readback", + "suite_id": "operator_debugging_ux", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "The XY-931 OpenMemory export-helper setup probe is Docker-contained in the mem0 baseline run. It detects the OpenMemory product tree, UI package, compose file, and export helper, but Docker is unavailable inside the baseline-runner container before the helper can reach a running OpenMemory product container or app database. 
Basic lifecycle and local SDK get_all readback are not reused as UI/export proof.", + "command": "cargo make openmemory-ui-export-readback", + "artifact": "tmp/live-baseline/mem0-openmemory-ui-export.json" + }, + { + "scenario_id": "hosted_platform_export", + "suite_id": "operator_debugging_ux", + "status": "unsupported", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "Hosted mem0 Platform export is explicitly outside the local OSS Docker comparison and is not counted as a local pass, loss, or blocker.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + }, + { + "scenario_id": "optional_graph_memory", + "suite_id": "memory_evolution", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "Optional graph memory is kept as an opt-in scenario gate. It is not enabled in the default mem0 local OSS run and is not part of the default pass/fail comparison.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + } + ], + "notes": [ + "Separate local OSS mem0 SDK evidence from OpenMemory product UI/export claims.", + "A blocked OpenMemory export-helper setup probe is not an ELF win or loss until the product app can import and export the same local corpus." 
+ ] + }, + { + "adapter_id": "memsearch_live_baseline", + "project": "memsearch", + "adapter_kind": "docker_cli_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "pass", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner can install memsearch and run its CLI path.", + "command": "ELF_BASELINE_PROJECTS=memsearch cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/memsearch.log" + }, + "run": { + "status": "pass", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 indexes a per-adapter corpus copy, rewrites and deletes files, reruns memsearch index, and reports memsearch 4/4 encoded checks passing.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "pass", + "evidence": "memsearch now passes the local same-corpus/reindex/update/delete/reload smoke. No real_world_job memsearch prompt adapter is encoded, so Markdown-first behavior remains baseline scenario evidence rather than suite pass evidence.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "capabilities": [ + { + "capability": "canonical_markdown_store", + "status": "real", + "evidence": "memsearch is tracked as a Markdown-first source-of-truth reference." + }, + { + "capability": "same_corpus_retrieval", + "status": "pass", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports memsearch retrieval_pass with 3/3 same-corpus retrieval checks." + }, + { + "capability": "reindex_update_delete_reload", + "status": "pass", + "evidence": "The runner rewrites auth-memory.md, deletes a second corpus file, reruns memsearch index, and starts fresh memsearch search processes; the fresh scoped run reports update, delete, and cold-start reload passing." 
+ }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "XY-925 adds fixture-backed prompt coverage for the Markdown source-store and retrieval-debug jobs, but no live memsearch runtime adapter executes real_world_job prompts and answer scoring." + }, + { + "capability": "markdown_source_store_prompt_jobs", + "status": "pass", + "evidence": "The first-generation OSS fixture slice encodes source-of-truth rebuild/reload and retrieval-debug prompts over the canonical Markdown store while preserving the live-baseline-only evidence boundary." + } + ], + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "not_encoded", + "evidence": "The Markdown-first source model passed the local reindex/reload smoke, and XY-925 adds fixture-backed source-of-truth prompt coverage over the canonical Markdown store. No live memsearch runtime adapter executes prompt scoring yet, so this is not a suite pass." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "The Docker same-corpus check passes, and XY-925 adds fixture-backed retrieval-debug prompt coverage over memsearch CLI replay and Markdown source inspection. No live memsearch runtime adapter executes retrieval prompt scoring yet, so this is not a suite pass." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "Update/delete reindex semantics pass in Docker, but memory_evolution real_world_job prompts are not encoded for memsearch." + } + ], + "scenarios": [ + { + "scenario_id": "canonical_markdown_reindex_reload", + "suite_id": "trust_source_of_truth", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports memsearch passed same-corpus retrieval, update reindex, delete suppression, and cold-start reload over a canonical Markdown corpus. 
ELF has no directly comparable canonical Markdown source-store scenario in this baseline, so the ELF position remains untested.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "markdown_source_store_rebuild_reload_prompt", + "suite_id": "trust_source_of_truth", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-925 adds a checked-in real_world_job prompt fixture that asks for the memsearch source-of-truth path and rebuild/reload boundary: canonical Markdown files are authoritative, while the index is derived by rerunning memsearch index. This is fixture-backed scenario coverage plus baseline artifact evidence, not a memsearch live real_world_job suite pass.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_markdown_rebuild_reload.json" + }, + { + "scenario_id": "markdown_retrieval_debug_prompt", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-925 adds a checked-in retrieval-debug prompt over memsearch's canonical Markdown store. 
The expected debug surface is CLI replay plus Markdown source inspection and reindexing; staged expansion/fusion/rerank/candidate-drop trace bundles remain not encoded for memsearch.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_retrieval_debug_prompt.json" + }, + { + "scenario_id": "ttl_expiry_lifecycle", + "suite_id": "memory_evolution", + "status": "unsupported", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "The encoded memsearch CLI path supports reindex/delete but no TTL or expiry behavior. Unsupported TTL behavior is preserved as unsupported competitor evidence and does not create an ELF win/loss claim without a directly comparable scenario artifact.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "real_world_prompt_adapter", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "No live memsearch runtime adapter currently executes real_world_job prompts and answer scoring. XY-925 fixture-backed prompt jobs document the source-store and retrieval-debug shape, while baseline retrieval/reindex evidence remains separate from suite pass claims.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + } + ], + "notes": [ + "Do not mark memsearch worse solely because setup or local indexing is heavier; preserve the typed incomplete/wrong-result boundary." 
+ ] + }, + { + "adapter_id": "openviking_live_baseline", + "project": "OpenViking", + "adapter_kind": "docker_local_embed_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "OpenViking local-embed setup installed and imported pinned llama-cpp-python==0.3.28 from the CPU wheel index in Docker.", + "command": "ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/OpenViking.log" + }, + "run": { + "status": "wrong_result", + "evidence": "The adapter reached same-corpus add_resource/find and now exposes expected/matched/missing evidence ids, but returned 0 of 3 expected evidence-term matches in the smoke run.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The current OpenViking Docker evidence is a behavioral wrong_result, not a local embedding setup blocker and not a real_world_job pass.", + "artifact": "docs/runbook/benchmarking/live_baseline_benchmark.md" + }, + "capabilities": [ + { + "capability": "local_embed_setup", + "status": "pass", + "evidence": "Docker local embedding dependency setup is pinned to llama-cpp-python==0.3.28 from https://abetlen.github.io/llama-cpp-python/whl/cpu and reached import/runtime in the smoke run." + }, + { + "capability": "same_corpus_retrieval", + "status": "wrong_result", + "evidence": "OpenViking add_resource/find returned resources but missed expected evidence-term matches for every smoke query." + }, + { + "capability": "context_trajectory", + "status": "blocked", + "evidence": "OpenViking staged/hierarchical retrieval is now encoded as blocked context_trajectory fixtures until same-corpus expected evidence ids match and staged artifacts are materialized." 
+ }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No OpenViking adapter currently executes real_world_job prompts and answer scoring." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "wrong_result", + "evidence": "The Docker-local setup reached add_resource/find, but the retrieval check returned 0/3 expected evidence-term matches." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Hierarchical context resume scenarios are not encoded for OpenViking." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "The staged retrieval, hierarchy selection, and recursive/context expansion fixtures are encoded as blocked behind same-corpus evidence output and staged artifact readback." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "wrong_result" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "OpenViking repository", + "url": "https://github.com/volcengine/OpenViking/", + "evidence": "Official source for OpenViking local context database, resource, and retrieval APIs." + }, + { + "label": "llama-cpp-python CPU wheel index", + "url": "https://abetlen.github.io/llama-cpp-python/whl/cpu", + "evidence": "Official prebuilt CPU wheel index used by the Docker-local embedding pin." + } + ], + "setup_path": "Run ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker. 
The runner installs llama-cpp-python==0.3.28 with --only-binary llama-cpp-python from the CPU wheel index before OpenViking add_resource/find.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner container; no host-global OpenViking, llama-cpp-python, or model service install is required.", + "resource_expectation": "Local embedding setup may download a CPU wheel and model assets; record OpenViking.log, elapsed time, and cache size before claiming adapter quality.", + "retry_guidance": [ + "Use the default pinned CPU wheel path first.", + "Override ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_VERSION or ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_INDEX only when the default wheel is unavailable for the Docker platform.", + "Treat install/import failure as incomplete, not wrong_result; treat add_resource/find evidence misses as wrong_result." + ] + }, + "notes": [ + "Record OpenViking as wrong_result now that the pinned Docker local embedding path reaches add_resource/find but misses expected evidence; keep context_trajectory as blocked until staged artifacts exist." + ], + "follow_up": { + "title": "Fix OpenViking evidence-bearing same-corpus retrieval output and materialize staged artifacts", + "reason": "The current adapter reaches add_resource/find and exposes expected evidence ids, but must match evidence ids and return stage/hierarchy/recursive artifacts before trajectory quality can be scored." 
+ } + }, + { + "adapter_id": "claude_mem_live_baseline", + "project": "claude-mem", + "adapter_kind": "docker_repository_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner can install and build claude-mem.", + "command": "ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/claude-mem.log" + }, + "run": { + "status": "wrong_result", + "evidence": "The Docker runner now uses a durable SQLite file, exercises repository update/delete/reopen checks, and reports missed same-corpus or lifecycle evidence as typed non-pass.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "No real_world_job claude-mem adapter is encoded; progressive disclosure remains a design reference.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "capabilities": [ + { + "capability": "same_corpus_retrieval", + "status": "wrong_result", + "evidence": "The current Docker adapter did not prove correct same-corpus retrieval." + }, + { + "capability": "durable_storage", + "status": "real", + "evidence": "The runner writes to a Docker-local SQLite file and constructs a new Database plus repository instances for cold-start recovery search." + }, + { + "capability": "repository_lifecycle", + "status": "real", + "evidence": "The runner uses MemoryItemsRepository.update, deletes from the repository-owned memory_items table, and relies on repository FTS triggers for update/delete checks." + }, + { + "capability": "repository_progressive_disclosure", + "status": "real", + "evidence": "The runner verifies search result to getById detail hydration and listSources source evidence on the durable repository path." 
+ }, + { + "capability": "progressive_disclosure_real_world_job", + "status": "pass", + "evidence": "XY-925 adds fixture-backed prompt coverage for the Docker-contained repository progressive-disclosure path: search result to getById detail hydration and listSources evidence on durable SQLite. Hook, timeline, and viewer workflows remain blocked separately." + }, + { + "capability": "retrieval_repair_artifact", + "status": "wrong_result", + "evidence": "The same-corpus retrieval smoke remains wrong_result, and XY-925 records a repair prompt that tells operators to rerun ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker before inspecting tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json." + }, + { + "capability": "hook_capture_viewer_workflow", + "status": "blocked", + "evidence": "The current Docker runner does not launch claude-mem hooks, timeline capture, local viewer readback, or an operator workflow over the same corpus." + } + ], + "suites": [ + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "The durable repository run is encoded, but hook-driven capture and real_world_job work-resume prompts are not proven by that local repository check." + }, + { + "suite_id": "operator_debugging_ux", + "status": "blocked", + "evidence": "XY-925 adds fixture-backed progressive-disclosure and retrieval-repair prompt coverage, but local viewer/operator workflow remains blocked until a Docker-contained viewer or equivalent readback runner exists." + }, + { + "suite_id": "capture_integration", + "status": "blocked", + "evidence": "claude-mem hook capture remains blocked because hooks, timeline capture, and observation workflows are not executed by this runner." 
+ } + ], + "scenarios": [ + { + "scenario_id": "same_corpus_retrieval", + "suite_id": "retrieval", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports ELF retrieval_pass and claude-mem same_corpus_retrieval as wrong_result with 0/3 expected query checks passing, while its durable repository setup completed. This is an ELF baseline win for the narrow retrieval smoke scenario.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "retrieval_repair_artifact_path", + "suite_id": "retrieval", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "XY-925 adds a checked-in repair prompt that preserves the claude-mem wrong_result and names rerun/inspection targets from the reproducible Docker baseline: tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json. This is repair evidence for a miss, not a retrieval pass.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_retrieval_repair.json" + }, + { + "scenario_id": "repository_lifecycle_reload", + "suite_id": "memory_evolution", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports ELF passing local lifecycle checks and claude-mem update, delete, and cold-start reload checks passing over a durable Docker-local SQLite repository. 
This is a local lifecycle-smoke tie, not a hook-driven work-resume or full progressive-disclosure job pass.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "progressive_disclosure_detail_hydration", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "claude-mem passed the repository-level search-to-detail/source hydration check, which is a useful progressive-disclosure signal. ELF does not have a directly comparable claude-mem-style progressive-disclosure scenario in this baseline, so the ELF position remains untested rather than a loss claim.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "progressive_disclosure_prompt", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-925 adds fixture-backed prompt coverage that asks for the measured claude-mem progressive-disclosure boundary: repository search results hydrate through getById and listSources on durable SQLite, but hooks, timeline, viewer, and live prompt scoring are not executed.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_progressive_disclosure.json" + }, + { + "scenario_id": "hook_capture_viewer_workflow", + "suite_id": "capture_integration", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "The Docker baseline uses repository classes only. 
claude-mem hooks, viewer, timeline, and observation workflows are not executed by the runner, so XY-925 preserves this as a typed blocker rather than not_encoded prose.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json" + }, + { + "scenario_id": "viewer_operator_workflow", + "suite_id": "operator_debugging_ux", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "A fair claude-mem viewer/operator comparison needs a Docker-contained run that opens the local viewer or equivalent readback over the same durable SQLite corpus and emits timeline, detail hydration, and repair-command artifacts. That path is not available in the current runner.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json" + } + ], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + } + ], + "notes": [ + "claude-mem remains a UX reference; durable repository checks do not prove hook, viewer, or full real-world progressive-disclosure behavior." 
+ ] + }, + { + "adapter_id": "qmd_deep_profile_gate", + "project": "qmd", + "adapter_kind": "docker_cli_deep_profile_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "pass", + "evidence": "qmd already has a Docker CLI live-baseline adapter; this gate records the deeper profile extension before a separate scaled run is claimed.", + "command": "ELF_BASELINE_PROJECTS=qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/qmd.log" + }, + "run": { + "status": "not_encoded", + "evidence": "The XY-899 strength-profile report is checked in, but no new live qmd deep-profile adapter artifact is claimed from it." + }, + "result": { + "status": "not_encoded", + "evidence": "The XY-899 report records qmd scenario-level retrieval/debug/replay outcomes and wrong-result diagnosis taxonomy, while expansion/fusion/rerank scoring remains not_encoded.", + "artifact": "docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md" + }, + "capabilities": [ + { + "capability": "stress_profile_retrieval_debug", + "status": "not_encoded", + "evidence": "The stress command path exists, but this adapter-pack gate has not published a deep qmd profile result." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "The qmd live real-world sweep covers the current encoded fixture corpus; expanded retrieval-debug strength suites still need their own materialized adapter run." + }, + { + "capability": "host_global_install_boundary", + "status": "unsupported", + "evidence": "Repository-supported qmd benchmark runs must stay inside docker-compose.baseline.yml and must not require host-global installs." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "A deeper stress retrieval-debug report is not checked in for this gate." 
+ }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "qmd query planning and score readback are not yet scored as operator-debugging real_world_job outputs." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/tobi/qmd", + "status": "real" + }, + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "qmd repository", + "url": "https://github.com/tobi/qmd", + "evidence": "Official qmd source for local hybrid search, CLI setup, and query behavior." + } + ], + "setup_path": "Use the existing Docker baseline qmd install, collection add, update, embed, and query flow with scale or stress profiles.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner container with project files and caches inside Docker volumes.", + "resource_expectation": "CPU local embedding and rerank cost scale with corpus size; record elapsed time and qmd log artifacts before claims.", + "retry_guidance": [ + "Run qmd stress profile in Docker and publish the artifact path.", + "Map qmd JSON output to retrieval-debug real_world_job scoring before suite claims." + ], + "research_depth": "D2 reviewed; deep profile not encoded" + }, + "notes": [ + "This gate deepens qmd planning without changing the existing qmd pass evidence from the smoke live baseline." 
+ ] + }, + { + "adapter_id": "openviking_deep_profile_gate", + "project": "OpenViking", + "adapter_kind": "docker_local_embed_context_trajectory_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "pass", + "evidence": "The default pinned OpenViking local embedding dependency path reaches runtime in Docker.", + "command": "ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/OpenViking.log" + }, + "run": { + "status": "blocked", + "evidence": "The XY-928 context_trajectory fixtures encode staged retrieval, hierarchy selection, and recursive/context expansion as blocked; no live trajectory adapter artifact is claimed." + }, + "result": { + "status": "blocked", + "evidence": "No OpenViking deep context-trajectory result is claimed from the current wrong-result smoke run; the XY-928 fixtures preserve trajectory surfaces as blocked/not_tested.", + "artifact": "docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md" + }, + "capabilities": [ + { + "capability": "docker_local_embed_setup", + "status": "pass", + "evidence": "The local embedding setup is pinned and reaches import/runtime in Docker." + }, + { + "capability": "hierarchical_context_trajectory", + "status": "blocked", + "evidence": "Stage trajectory scoring is encoded as blocked until the smoke adapter returns evidence-bearing same-corpus output and selected hierarchy/expansion artifacts." + }, + { + "capability": "host_global_install_boundary", + "status": "unsupported", + "evidence": "The adapter pack must not ask operators to install OpenViking dependencies globally on the host." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "wrong_result", + "evidence": "Same-corpus retrieval is still the precondition and remains wrong_result in the live baseline." 
+ }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "OpenViking staged retrieval, hierarchy selection, and recursive/context expansion jobs are encoded as blocked fixtures." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "Trajectory readback is a reference feature but not a scored adapter output." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/volcengine/OpenViking/", + "status": "real" + }, + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "wrong_result" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "OpenViking repository", + "url": "https://github.com/volcengine/OpenViking/", + "evidence": "Official source for OpenViking local context database, resource, and retrieval APIs." + } + ], + "setup_path": "Use the pinned Docker local embedding path from scripts/live-baseline-benchmark.sh, then run OpenViking add_resource/find before any deep profile scoring.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner container; no host model or compiler setup outside Docker.", + "resource_expectation": "Local embedding setup can download CPU wheels and model assets; record build/import logs, model cache size, and elapsed time.", + "retry_guidance": [ + "Run the default pinned llama-cpp-python==0.3.28 CPU wheel path first.", + "Override the OpenViking llama-cpp-python version or index only when the default wheel is unavailable for the Docker platform.", + "Fix evidence-bearing same-corpus output and materialize selected hierarchy/expansion artifacts before converting blocked context_trajectory fixtures into scored jobs." + ], + "research_depth": "D2 reviewed; local embedding setup pinned; blocked fixtures encoded" + }, + "notes": [ + "OpenViking remains a context-trajectory reference, but this gate prevents a smoke wrong_result or blocked fixture from becoming a deep-profile win claim." 
+ ] + }, + { + "adapter_id": "ragflow_research_gate", + "project": "RAGFlow", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "XY-900 promotes the Docker-safe tiny-corpus evidence smoke into a generated real_world_job report while the checked-in row remains smoke-only research_gate evidence.", + "command": "cargo make smoke-ragflow-docker", + "artifact": "tmp/real-world-memory/ragflow-smoke/ragflow-smoke.json" + }, + "run": { + "status": "blocked", + "evidence": "The live path requires explicit resource-envelope opt-in and a local self-hosted RAGFlow API key; setup failures stay typed in the generated smoke artifact.", + "command": "ELF_RAGFLOW_SMOKE_START=1 ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 cargo make smoke-ragflow-docker", + "artifact": "tmp/real-world-memory/ragflow-smoke/memory_projects_manifest.ragflow-smoke.json" + }, + "result": { + "status": "blocked", + "evidence": "The smoke now emits ragflow-report.json and ragflow-report.md from one generated retrieval job. Pass or wrong_result is allowed only when returned reference chunks map to generated evidence ids; resource, setup, and API-key limits remain typed blockers.", + "artifact": "tmp/real-world-memory/ragflow-smoke/ragflow-report.json" + }, + "capabilities": [ + { + "capability": "adapter_candidate_verdict", + "status": "not_encoded", + "evidence": "XY-882 completed D1/D2 feasibility research and marks RAGFlow adapter_candidate; no adapter run is encoded." + }, + { + "capability": "docker_service_setup", + "status": "blocked", + "evidence": "The smoke records official Docker setup, image/disk/startup envelope, CPU/GPU mode, vm.max_map_count handling, provider boundaries, and retry behavior." 
+ }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "One generated retrieval job is scored from the smoke artifact or typed blocked when resource, service, or local API-key boundaries stop execution." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The scored smoke does not claim broad RAGFlow quality, private corpus behavior, scale, or comparative ranking." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "blocked", + "evidence": "The generated retrieval smoke is scored as pass, wrong_result, blocked, or incomplete by ragflow-report.json; the checked-in row remains blocked until live reference chunks map to evidence ids." + }, + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "RAGFlow knowledge output is not mapped to real_world_job page or citation scoring." + }, + { + "suite_id": "production_ops", + "status": "blocked", + "evidence": "Resource envelope and service startup retry guidance must be documented first." + } + ], + "scenarios": [ + { + "scenario_id": "reference_chunk_citation_mapping", + "suite_id": "retrieval", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-929 adds a representative blocked fixture for RAGFlow reference-chunk citation scoring. 
The job must remain blocked until returned reference chunks include generated document ids, chunk ids, content, and document metadata mapped to benchmark evidence ids.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/ragflow_reference_chunks_blocked.json" + }, + { + "scenario_id": "retrieval_quality_reference_recall", + "suite_id": "retrieval", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-1071 keeps RAGFlow retrieval quality blocked until the same generated corpus returns answer text and selected reference chunks whose document ids, chunk ids, content, and metadata map to expected evidence ids; setup or API reachability alone is not retrieval quality evidence.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/ragflow_reference_chunks_blocked.json" + }, + { + "scenario_id": "navigation_quality_document_chunks", + "suite_id": "retrieval", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "RAGFlow document/chunk navigation remains blocked until returned references expose stable document metadata plus chunk identifiers that can be followed back to same-corpus source evidence.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/ragflow_reference_chunks_blocked.json" + }, + { + "scenario_id": "answer_faithfulness_reference_chunks", + "suite_id": "retrieval", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "RAGFlow answer faithfulness is blocked until generated answers can be checked against returned reference chunk content and decoy/stale chunks are absent from cited support.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": 
"apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/ragflow_reference_chunks_blocked.json" + }, + { + "scenario_id": "stale_source_behavior", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "RAGFlow stale-source replacement, invalidation, or lint behavior is not encoded by the current same-corpus reference-chunk blocker; no stale-source quality claim is made.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + }, + { + "scenario_id": "knowledge_compilation_quality", + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "RAGFlow knowledge compilation quality is not scored because no checked-in same-corpus RAGFlow page, section, citation, or stale-source lint artifact exists.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + }, + { + "scenario_id": "private_or_large_corpus_ragflow_quality", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "Private corpus, large-corpus, and hosted RAGFlow quality are outside the generated-public Docker representative lane and must not be inferred from smoke reports.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/infiniflow/ragflow", + "status": "real" + }, + { + "kind": "source", + "ref": "https://ragflow.io/docs/", + "status": "real" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/ragflow-smoke/ragflow-report.json", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/ragflow-smoke/ragflow-report.md", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": 
"RAGFlow repository", + "url": "https://github.com/infiniflow/ragflow", + "evidence": "Official source for RAGFlow service code and Docker Compose setup." + }, + { + "label": "RAGFlow docs", + "url": "https://ragflow.io/docs/", + "evidence": "Official deployment and setup documentation." + }, + { + "label": "RAGFlow HTTP API reference", + "url": "https://raw.githubusercontent.com/infiniflow/ragflow/main/docs/references/http_api_reference.md", + "evidence": "Official reference for OpenAI-compatible responses with reference chunks and document metadata." + } + ], + "setup_path": "Implement a tiny Docker evidence-smoke runner using the official Docker deployment, dataset ingest API, and OpenAI-compatible query API.", + "runtime_boundary": "Run scripts/ragflow-docker-evidence-smoke.sh through cargo make; the live path uses the official RAGFlow Docker Compose service boundary without host-global RAGFlow installs.", + "resource_expectation": "Large multi-service RAG stack; generated artifacts record CPU/GPU mode, memory, disk, image size, expanded disk notes, startup time, vm.max_map_count handling, and provider boundaries before scoring.", + "retry_guidance": [ + "Run cargo make smoke-ragflow-docker first to produce a typed preflight artifact.", + "Start the live path only with ELF_RAGFLOW_SMOKE_START=1 and ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1.", + "Keep private corpora and operator-owned provider credentials out of this smoke; map only generated public corpus reference chunks to evidence ids." + ], + "research_depth": "D2 feasibility verdict plus XY-885 evidence-smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches query output" + }, + "notes": [ + "Status class: smoke-only scored adapter path with typed resource/setup/API-key blockers.", + "Do not interpret ragflow-report.json as broad RAGFlow quality evidence unless reference chunks map to generated evidence ids." 
+ ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement RAGFlow Docker evidence-smoke adapter", + "reason": "Created as XY-885. XY-882 found a Docker boundary and reference-chunk output contract; implementation must prove a tiny ingest/query run before any quality claim." + } + }, + { + "adapter_id": "lightrag_research_gate", + "project": "LightRAG", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "XY-886 adds a Docker-profile context-export smoke command, and XY-900 keeps its generated retrieval fixtures scored through real_world_job_benchmark. The checked-in row remains smoke-only research_gate evidence.", + "command": "cargo make smoke-lightrag-docker-context", + "artifact": "tmp/real-world-memory/lightrag-context/lightrag-materialization.json" + }, + "run": { + "status": "blocked", + "evidence": "The default smoke records a typed setup/runtime failure if the LightRAG API is unavailable; set ELF_LIGHTRAG_CONTEXT_START=1 to start the opt-in Docker service profile.", + "command": "ELF_LIGHTRAG_CONTEXT_START=1 cargo make smoke-lightrag-docker-context", + "artifact": "tmp/real-world-memory/lightrag-context/summary.json" + }, + "result": { + "status": "blocked", + "evidence": "The smoke emits lightrag-report.json and lightrag-report.md over generated retrieval jobs. Pass or wrong_result is allowed only when returned context, references, or file paths map to generated evidence ids.", + "artifact": "tmp/real-world-memory/lightrag-context/lightrag-report.json" + }, + "capabilities": [ + { + "capability": "docker_service_setup", + "status": "blocked", + "evidence": "The opt-in compose profile records explicit LightRAG image, LLM, embedding, rerank, workspace, and Docker volume configuration without host-global installs." 
+ }, + { + "capability": "retrieved_context_export", + "status": "blocked", + "evidence": "The materializer calls /documents/texts, waits on /documents/track_status, and queries /query with only_need_context plus chunk references when the service is reachable." + }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "The LightRAG materializer rewrites generated retrieval fixtures with adapter_response evidence only when source paths or context map to required evidence ids." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The smoke does not score broad graph-RAG quality, private corpora, scale, or comparative ranking claims." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "blocked", + "evidence": "The generated smoke can exercise retrieval context/source mapping for retrieval fixtures, but the checked-in record stays blocked until a live artifact reaches query output." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "LightRAG update/delete/current-versus-historical behavior is not encoded by the context-export smoke." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "The smoke records context/source mappings, but full trace or viewer diagnostics are not mapped to benchmark scoring." + } + ], + "scenarios": [ + { + "scenario_id": "context_source_reference_mapping", + "suite_id": "retrieval", + "status": "incomplete", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-929 adds a representative incomplete fixture for LightRAG context/source-reference scoring. 
The job cannot score until the opt-in Docker API exports generated source file paths, snippets, or reference content.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/lightrag_context_sources_incomplete.json" + }, + { + "scenario_id": "retrieval_quality_context_recall", + "suite_id": "retrieval", + "status": "incomplete", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-1071 keeps LightRAG retrieval quality incomplete until the opt-in Docker API exports same-corpus context or references that can be joined to expected evidence ids; service startup alone is not a retrieval-quality result.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/lightrag_context_sources_incomplete.json" + }, + { + "scenario_id": "citation_quality_context_references", + "suite_id": "retrieval", + "status": "incomplete", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "LightRAG citation quality is incomplete until returned context, references.file_path, references.content, or equivalent source snippets map to generated evidence ids.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/lightrag_context_sources_incomplete.json" + }, + { + "scenario_id": "navigation_quality_graph_context", + "suite_id": "retrieval", + "status": "incomplete", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "LightRAG graph/context navigation remains incomplete until exported context exposes source paths or graph-derived source snippets that can be followed back to same-corpus evidence.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/lightrag_context_sources_incomplete.json" + }, + { + 
"scenario_id": "answer_faithfulness_context_refs", + "suite_id": "retrieval", + "status": "incomplete", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "LightRAG answer faithfulness is incomplete until generated answers and only_need_context output can be checked for required evidence, decoy exclusion, and source-reference alignment.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/lightrag_context_sources_incomplete.json" + }, + { + "scenario_id": "stale_source_behavior", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "LightRAG stale-source replacement, invalidation, or lint behavior is not encoded by the current context-source blocker; no stale-source quality claim is made.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + }, + { + "scenario_id": "knowledge_compilation_quality", + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "LightRAG knowledge compilation quality is not scored because no checked-in same-corpus page, section, citation, or stale-source lint artifact exists.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + }, + { + "scenario_id": "graph_rag_navigation_quality", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "LightRAG graph-RAG navigation quality remains not_tested beyond the context-source output contract; no ELF win, tie, or loss is claimed.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/HKUDS/LightRAG", + "status": "real" + }, 
+ { + "kind": "source", + "ref": "https://github.com/HKUDS/LightRAG/blob/main/docs/DockerDeployment.md", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make smoke-lightrag-docker-context", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/lightrag-context/lightrag-materialization.json", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/lightrag-context/lightrag-report.md", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "LightRAG repository", + "url": "https://github.com/HKUDS/LightRAG", + "evidence": "Official source for LightRAG server, Docker, and retrieval modes." + }, + { + "label": "LightRAG Docker docs", + "url": "https://github.com/HKUDS/LightRAG/blob/main/docs/DockerDeployment.md", + "evidence": "Official Docker deployment reference." + }, + { + "label": "LightRAG API server docs", + "url": "https://github.com/HKUDS/LightRAG/blob/main/docs/LightRAG-API-Server.md", + "evidence": "Official query-mode and context-output reference." + }, + { + "label": "LightRAG core programming docs", + "url": "https://github.com/HKUDS/LightRAG/blob/main/docs/ProgramingWithCore.md", + "evidence": "Official source-id and file-path citation reference." 
+ } + ], + "setup_path": "Run cargo make smoke-lightrag-docker-context for a typed preflight artifact; set ELF_LIGHTRAG_CONTEXT_START=1 to start the opt-in LightRAG Docker profile and attempt live context export.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner plus opt-in lightrag and lightrag-mock-provider services; generated source files and LightRAG data stay in Docker-mounted artifact paths and Docker volumes.", + "resource_expectation": "The default profile uses the official LightRAG image, a local OpenAI-compatible mock provider, 64-dimensional embeddings, rerank disabled for context queries, cargo/pip/Hugging Face caches, and Docker volumes for rag_storage, inputs, and prompts.", + "retry_guidance": [ + "Run cargo make smoke-lightrag-docker-context first; a missing API must remain a typed incomplete artifact, not a pass claim.", + "Set ELF_LIGHTRAG_CONTEXT_START=1 only when Docker may pull/start the LightRAG service profile.", + "Score retrieval only when returned context, references.file_path, or references.content map to required evidence ids." + ], + "research_depth": "D2 feasibility plus XY-886 context-export implementation and XY-900 scored smoke aggregation; checked-in record remains research_gate unless a generated artifact reaches query output" + }, + "notes": [ + "Status class: smoke-only scored adapter path with typed service/setup blockers.", + "Do not interpret lightrag-report.json as broad graph-RAG quality evidence unless generated source/context mappings score as pass." + ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement LightRAG Docker context-export adapter", + "reason": "Created as XY-886. XY-882 found a Docker service path and context/source mapping contract; implementation must prove evidence export before scoring." 
+ } + }, + { + "adapter_id": "graphrag_research_gate", + "project": "GraphRAG", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "XY-900 promotes the Docker-safe generated-corpus GraphRAG smoke into a scored knowledge_compilation report while the checked-in row remains smoke-only research_gate evidence.", + "command": "cargo make smoke-graphrag-docker", + "artifact": "tmp/real-world-memory/graphrag-smoke/graphrag-smoke.json" + }, + "run": { + "status": "blocked", + "evidence": "The default smoke records a typed blocked artifact without model calls; set ELF_GRAPHRAG_SMOKE_RUN=1 with explicit provider configuration to attempt live GraphRAG index/query.", + "command": "ELF_GRAPHRAG_SMOKE_RUN=1 cargo make smoke-graphrag-docker", + "artifact": "tmp/real-world-memory/graphrag-smoke/summary.json" + }, + "result": { + "status": "blocked", + "evidence": "The smoke now emits graphrag-report.json and graphrag-report.md from one generated knowledge_compilation job. Pass or wrong_result is allowed only when GraphRAG output tables map to generated evidence ids.", + "artifact": "tmp/real-world-memory/graphrag-smoke/graphrag-report.json" + }, + "capabilities": [ + { + "capability": "indexing_resource_envelope", + "status": "blocked", + "evidence": "The smoke bounds the generated public corpus, timeout, GraphRAG package, model configuration, cache size, output size, elapsed time, and observed cache entries." + }, + { + "capability": "source_citation_mapping", + "status": "blocked", + "evidence": "The generated artifact maps GraphRAG documents, text_units, communities, community_reports, entities, and relationships parquet rows back to real_world_job evidence ids when available." 
+ }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "The smoke writes a generated real_world_job fixture and scored report; provider/setup limits remain blocked until live GraphRAG output maps to expected evidence ids." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The smoke does not claim broad graph-navigation quality, knowledge-synthesis quality, private corpora, or large-corpus indexing." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "blocked", + "evidence": "The generated smoke can exercise parquet table source coverage for one tiny knowledge-compilation fixture, but the checked-in record stays blocked until live output exists." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "The smoke may run local search for reachability, but retrieval quality scoring is not encoded." + }, + { + "suite_id": "production_ops", + "status": "not_encoded", + "evidence": "Resource bounds are recorded, but no production-ops suite scoring is encoded." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "GraphRAG update/delete/current-versus-historical behavior is not encoded by the smoke." + } + ], + "scenarios": [ + { + "scenario_id": "output_table_citation_mapping", + "suite_id": "knowledge_compilation", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-929 adds a representative blocked fixture for GraphRAG output-table citation scoring. 
The job requires provider-backed Docker output tables whose document, text-unit, community, report, entity, and relationship identifiers map to generated evidence ids.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphrag_output_tables_blocked.json" + }, + { + "scenario_id": "retrieval_quality_local_search", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-1071 keeps GraphRAG retrieval quality not tested because the current smoke records output-table and local-search reachability contracts but does not score same-corpus retrieval answers beyond mapped output prerequisites.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + }, + { + "scenario_id": "navigation_quality_community_graph", + "suite_id": "knowledge_compilation", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "GraphRAG community/entity/relationship navigation remains blocked until provider-backed output tables expose community, entity, relationship, text-unit, and document identifiers that map to generated evidence ids.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphrag_output_tables_blocked.json" + }, + { + "scenario_id": "answer_faithfulness_output_tables", + "suite_id": "knowledge_compilation", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "GraphRAG answer faithfulness is blocked until summaries or local-search answers can be checked against mapped documents, text units, and community report rows while excluding unsupported or stale claims.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": 
"apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphrag_output_tables_blocked.json" + }, + { + "scenario_id": "stale_source_behavior", + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "GraphRAG stale-source replacement, invalidation, or lint behavior is not encoded by the current output-table blocker; no stale-source quality claim is made.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + }, + { + "scenario_id": "graph_summary_synthesis_quality", + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "GraphRAG graph-summary synthesis quality remains not_tested until provider-backed output tables and local-search context are scored beyond the smoke contract.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/microsoft/graphrag", + "status": "real" + }, + { + "kind": "source", + "ref": "https://microsoft.github.io/graphrag/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make smoke-graphrag-docker", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphrag-smoke/graphrag-smoke.json", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphrag-smoke/graphrag-report.md", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "GraphRAG repository", + "url": "https://github.com/microsoft/graphrag", + "evidence": "Official Microsoft GraphRAG source and setup reference." + }, + { + "label": "GraphRAG docs", + "url": "https://microsoft.github.io/graphrag/", + "evidence": "Official documentation for indexing and querying." 
+ }, + { + "label": "GraphRAG input docs", + "url": "https://microsoft.github.io/graphrag/index/inputs/", + "evidence": "Official input format and document metadata reference." + }, + { + "label": "GraphRAG output tables", + "url": "https://microsoft.github.io/graphrag/index/outputs/", + "evidence": "Official output schema with document, text unit, community, and relationship identifiers." + }, + { + "label": "GraphRAG local search docs", + "url": "https://microsoft.github.io/graphrag/query/local_search/", + "evidence": "Official local-search context and graph traversal reference." + } + ], + "setup_path": "Run cargo make smoke-graphrag-docker for a typed preflight artifact; set ELF_GRAPHRAG_SMOKE_RUN=1 with explicit provider configuration for a live GraphRAG index/query attempt.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner, container-local Python venv, generated public corpus, and report artifacts under tmp/real-world-memory/graphrag-smoke.", + "resource_expectation": "The default profile uses a generated public corpus capped by ELF_GRAPHRAG_MAX_DOCS and ELF_GRAPHRAG_MAX_INPUT_CHARS, pins GraphRAG through ELF_GRAPHRAG_PACKAGE, and records elapsed time, cache size, output size, and observed cache entries.", + "retry_guidance": [ + "Run cargo make smoke-graphrag-docker first; missing provider configuration must remain a typed blocked artifact, not a pass claim.", + "Enable ELF_GRAPHRAG_SMOKE_RUN=1 only for generated public corpus indexing with explicit provider configuration.", + "Fail typed if source document or text_unit identifiers cannot be mapped to expected evidence IDs." 
+ ], + "research_depth": "D2 feasibility plus XY-887 Docker smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches GraphRAG output" + }, + "notes": [ + "Status class: smoke-only scored adapter path with typed provider/setup blockers.", + "Do not interpret graphrag-report.json as broad graph-navigation or knowledge-synthesis quality evidence unless output tables map to generated evidence ids." + ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement GraphRAG cost-bounded Docker adapter", + "reason": "Created as XY-887. XY-882 found a Docker-bounded CLI/API path and output-table evidence handles; implementation must stay tiny and cost-recorded." + } + }, + { + "adapter_id": "graphiti_zep_research_gate", + "project": "Graphiti/Zep", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "XY-900 promotes the Docker-contained Graphiti/Zep temporal smoke into a scored memory_evolution report while the checked-in row remains smoke-only research_gate evidence.", + "command": "cargo make smoke-graphiti-zep-docker-temporal", + "artifact": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-smoke.json" + }, + "run": { + "status": "blocked", + "evidence": "The default smoke records a typed setup/runtime failure if live execution is not explicitly enabled. 
Set ELF_GRAPHITI_ZEP_SMOKE_START=1 and ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration to start Docker-local FalkorDB and run Graphiti.", + "command": "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal", + "artifact": "tmp/real-world-memory/graphiti-zep-smoke/summary.json" + }, + "result": { + "status": "blocked", + "evidence": "The smoke now emits graphiti-zep-report.json and graphiti-zep-report.md from one generated memory_evolution job. The default blocker is live-run opt-in disabled; when ELF_GRAPHITI_ZEP_SMOKE_START=1 and ELF_GRAPHITI_ZEP_SMOKE_RUN=1 are set without provider credentials, the blocker is provider_api_key_missing. No hosted Zep service or unrecorded credentials are used.", + "artifact": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-report.json" + }, + "capabilities": [ + { + "capability": "temporal_graph_memory", + "status": "blocked", + "evidence": "The smoke materializes generated current, historical, and rationale facts with validity windows, but the checked-in record stays blocked until a live artifact maps search output." + }, + { + "capability": "docker_graph_store_setup", + "status": "blocked", + "evidence": "The task uses a Docker Compose graphiti-zep profile for FalkorDB and a container-local Python venv; no host-global graph database or hosted Zep service is used." + }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "The generated temporal-validity fixture is scored or typed blocked; live quality evidence requires Graphiti/Zep search output mapped to current and historical evidence ids." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The smoke does not claim broad graph-memory quality, managed Zep service behavior, private-corpus behavior, or large-corpus performance." 
+ } + ], + "suites": [ + { + "suite_id": "memory_evolution", + "status": "blocked", + "evidence": "Generated current/historical relation facts are encoded, but the checked-in manifest stays blocked until the Docker smoke returns validity-window search output." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "Hybrid graph retrieval reachability is not scored beyond the temporal search smoke." + }, + { + "suite_id": "production_ops", + "status": "not_encoded", + "evidence": "The smoke records setup and provider boundaries but does not encode backup, restore, private corpus, or hosted-service operations." + } + ], + "scenarios": [ + { + "scenario_id": "temporal_validity_window_mapping", + "suite_id": "memory_evolution", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-929 adds a representative blocked fixture for Graphiti/Zep temporal-validity scoring. The job remains blocked until provider-backed Docker output maps current and historical validity-window facts to generated evidence ids.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphiti_temporal_validity_blocked.json" + }, + { + "scenario_id": "hosted_zep_temporal_memory", + "suite_id": "memory_evolution", + "status": "unsupported", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "Hosted Zep service behavior is outside the Docker-local representative lane; no hosted-service result is used as ELF win/loss evidence.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/getzep/graphiti", + "status": "real" + }, + { + "kind": "source", + "ref": "https://www.getzep.com/platform/graphiti/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make smoke-graphiti-zep-docker-temporal", 
+ "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-smoke.json", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-report.md", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "Graphiti repository", + "url": "https://github.com/getzep/graphiti", + "evidence": "Official open-source temporal context graph engine." + }, + { + "label": "Zep Graphiti overview", + "url": "https://www.getzep.com/platform/graphiti/", + "evidence": "Official product documentation for temporal context graph behavior." + }, + { + "label": "Graphiti quick start", + "url": "https://help.getzep.com/graphiti/getting-started/quick-start", + "evidence": "Official setup, episode ingest, and search output reference." + }, + { + "label": "Graphiti FalkorDB configuration", + "url": "https://help.getzep.com/graphiti/configuration/falkor-db-configuration", + "evidence": "Official Docker-local FalkorDB setup reference." + }, + { + "label": "Graphiti fact triples", + "url": "https://help.getzep.com/graphiti/working-with-data/adding-fact-triples", + "evidence": "Official manual fact-triple ingest contract." 
+ } + ], + "setup_path": "Run cargo make smoke-graphiti-zep-docker-temporal for a typed artifact; set ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration for a live attempt.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner plus graphiti-zep FalkorDB profile, container-local Python venv, generated public temporal facts, and report artifacts under tmp/real-world-memory/graphiti-zep-smoke.", + "resource_expectation": "Requires Docker-local FalkorDB plus LLM/embedding configuration; generated artifacts record service startup, storage size, provider boundaries, fact count, and timeout before scoring.", + "retry_guidance": [ + "Run cargo make smoke-graphiti-zep-docker-temporal first to produce a typed blocked artifact.", + "Start the live path only with ELF_GRAPHITI_ZEP_SMOKE_START=1, ELF_GRAPHITI_ZEP_SMOKE_RUN=1, and explicit provider configuration.", + "Treat missing validity windows or unmapped current/historical facts as wrong_result, not pass." + ], + "research_depth": "D2 feasibility plus XY-888 Docker temporal smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches Graphiti search output" + }, + "notes": [ + "Status class: smoke-only scored adapter path with typed live-run opt-in, provider, and setup blockers.", + "Graphiti/Zep remains the temporal-validity reference; do not claim ELF-over-Graphiti/Zep until provider-backed temporal output maps to scored evidence ids." + ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement Graphiti/Zep temporal graph adapter", + "reason": "Created as XY-888. XY-882 found a Docker-local graph-store path and fact/validity-window output contract for memory_evolution scoring." 
+ } + }, + { + "adapter_id": "letta_research_gate", + "project": "Letta", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "Letta is D1 reviewed as a core/archival memory reference. The contained comparison contract now has cargo make smoke-letta-core-archive-export-readback, a Docker-only benchmark-created agent export/readback materializer that must return core block JSON, archival search/readback JSON, and source ids before any scenario claim is scored.", + "command": "cargo make smoke-letta-core-archive-export-readback", + "artifact": "tmp/real-world-memory/letta-core-archive/letta-core-archive-export.json" + }, + "run": { + "status": "blocked", + "evidence": "The default materializer emits a typed blocked report unless a Docker-local Letta server and explicit model/provider configuration produce benchmark-owned core block export and archival readback/search output.", + "command": "ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 cargo make smoke-letta-core-archive-export-readback", + "artifact": "tmp/real-world-memory/letta-core-archive/summary.json" + }, + "result": { + "status": "blocked", + "evidence": "No Letta core block, archival fallback, stale-core, scope, provenance, or project-decision pass/win/tie/loss is claimed until the generated export/readback artifact maps required source ids.", + "artifact": "tmp/real-world-memory/letta-core-archive/report.json" + }, + "capabilities": [ + { + "capability": "core_archival_memory", + "status": "blocked", + "evidence": "ELF fixture jobs score core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search; Letta remains blocked until its export maps equivalent source ids." 
+ }, + { + "capability": "docker_embedding_configuration", + "status": "blocked", + "evidence": "Official Docker setup requires explicit embedding configuration before archival retrieval can be tested." + }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "A Docker-contained materializer now exists and emits typed blocked evidence by default; live scoring still requires exported Letta core blocks, archival list/search JSON, and source-id mappings." + }, + { + "capability": "broad_letta_quality_claim", + "status": "not_encoded", + "evidence": "The materializer does not score broad Letta product quality, hosted/private state, personalization breadth, or production durability." + } + ], + "suites": [ + { + "suite_id": "personalization", + "status": "not_encoded", + "evidence": "Core memory preference application is not encoded for Letta." + }, + { + "suite_id": "project_decisions", + "status": "blocked", + "evidence": "The project-decision recovery row is represented only through the core_archival_memory export/readback materializer and remains blocked without mapped source ids." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Agent resumption through Letta memory blocks is not encoded." + }, + { + "suite_id": "core_archival_memory", + "status": "blocked", + "evidence": "A Docker-contained materializer now emits the core_archival_memory scenarios as typed blocked unless live Letta export/readback maps core block JSON, archival search/readback JSON, and source ids." + } + ], + "scenarios": [ + { + "scenario_id": "core_block_attachment_readback", + "suite_id": "core_archival_memory", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-core-block-attachment-001 scores exact core block attachment and keeps core readback out of Qdrant-backed archival search. 
Letta remains blocked until the generated export/readback artifact maps this core block attachment source id.", + "command": "cargo make smoke-letta-core-archive-export-readback", + "artifact": "tmp/real-world-memory/letta-core-archive/summary.json" + }, + { + "scenario_id": "core_block_scope_readback", + "suite_id": "core_archival_memory", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-core-block-scope-001 scores read_profile, shared scope, and private-owner boundaries. Letta scope behavior remains blocked until the generated export includes agent, block, visibility metadata, and source ids.", + "command": "cargo make smoke-letta-core-archive-export-readback", + "artifact": "tmp/real-world-memory/letta-core-archive/summary.json" + }, + { + "scenario_id": "core_block_provenance_readback", + "suite_id": "core_archival_memory", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-core-block-provenance-001 scores source_ref and audit_history readback. Letta provenance remains blocked until exported core memory includes stable source ids and audit-equivalent events.", + "command": "cargo make smoke-letta-core-archive-export-readback", + "artifact": "tmp/real-world-memory/letta-core-archive/summary.json" + }, + { + "scenario_id": "stale_core_detection", + "suite_id": "core_archival_memory", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-stale-core-detection-001 scores archival evidence superseding a stale core block. 
Letta stale-core comparison is blocked until core export and archival readback can be joined by source ids.", + "command": "cargo make smoke-letta-core-archive-export-readback", + "artifact": "tmp/real-world-memory/letta-core-archive/summary.json" + }, + { + "scenario_id": "archival_fallback_readback", + "suite_id": "core_archival_memory", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-archival-fallback-001 scores fallback from insufficient core memory to archival note search. Letta fallback comparison is blocked until archival search output can be exported with source ids.", + "command": "cargo make smoke-letta-core-archive-export-readback", + "artifact": "tmp/real-world-memory/letta-core-archive/summary.json" + }, + { + "scenario_id": "core_archival_project_decision_recovery", + "suite_id": "core_archival_memory", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-project-decision-recovery-001 scores core routing plus archival decision rationale. 
Letta project-decision recovery remains blocked until the generated export/readback artifact maps core routing plus archival rationale source ids.", + "command": "cargo make smoke-letta-core-archive-export-readback", + "artifact": "tmp/real-world-memory/letta-core-archive/summary.json" + } + ], + "evidence": [ + { + "kind": "artifact", + "ref": "tmp/real-world-memory/letta-core-archive/letta-core-archive-export.json", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/letta-core-archive/summary.json", + "status": "blocked" + }, + { + "kind": "source", + "ref": "https://docs.letta.com/guides/docker", + "status": "real" + }, + { + "kind": "source", + "ref": "https://docs.letta.com/api/python", + "status": "real" + }, + { + "kind": "source", + "ref": "https://docs.letta.com/api/resources/agents/subresources/passages/methods/search", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "Letta Docker docs", + "url": "https://docs.letta.com/guides/docker", + "evidence": "Official Docker setup and explicit embedding configuration boundary." + }, + { + "label": "Letta Python API", + "url": "https://docs.letta.com/api/python", + "evidence": "Official Python SDK memory block creation and retrieval examples." + }, + { + "label": "Letta archival search API", + "url": "https://docs.letta.com/api/resources/agents/subresources/passages/methods/search", + "evidence": "Official archival-memory search endpoint contract." + } + ], + "setup_path": "Run cargo make smoke-letta-core-archive-export-readback for a typed artifact; set ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 with explicit model/provider configuration for a live export attempt. 
The smoke exports core block JSON plus archival search/readback JSON when Letta setup succeeds.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner plus optional Letta server profile, benchmark-created agent, benchmark-owned fixture corpus, no hosted/private state, and artifacts under tmp/real-world-memory/letta-core-archive.", + "resource_expectation": "Letta Docker server, Python SDK client, explicit model and embedding configuration, exported core memory, archival search output, and provider boundaries must be explicit in the artifact.", + "retry_guidance": [ + "Default command records a typed blocked artifact without model calls.", + "Enable the live path only with Docker-local Letta and explicit provider or local model configuration.", + "Score core-versus-archival scenarios only after core block export and archival list/search output map to fixture evidence ids." + ], + "research_depth": "D1 feasibility verdict: research_only (XY-882); XY-927 selected the contained export/readback contract; XY-984 adds the Docker-contained materializer and keeps the comparison blocked until live export evidence maps source ids." + }, + "notes": [] + }, + { + "adapter_id": "langgraph_research_gate", + "project": "LangGraph", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "LangGraph is D1 reviewed as a replay/checkpoint reference, not a direct memory backend adapter." + }, + "run": { + "status": "not_encoded", + "evidence": "No checkpoint replay real_world_job harness is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No production-ops or resume suite result is claimed." + }, + "capabilities": [ + { + "capability": "checkpoint_replay_regression", + "status": "not_encoded", + "evidence": "Replay/fork behavior needs an agent graph harness before scoring." 
+ }, + { + "capability": "standalone_memory_backend", + "status": "unsupported", + "evidence": "LangGraph persistence is an agent-state/checkpoint layer, not a drop-in memory retrieval backend." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No LangGraph benchmark materializer exists." + } + ], + "suites": [ + { + "suite_id": "production_ops", + "status": "not_encoded", + "evidence": "Checkpoint recovery and replay regression are not encoded." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Resume from checkpoint with memory reads is not encoded." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "source", + "ref": "https://docs.langchain.com/oss/python/langgraph/persistence", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "LangGraph persistence docs", + "url": "https://docs.langchain.com/oss/python/langgraph/persistence", + "evidence": "Official documentation for checkpoints, replay, fork, and persistence behavior." + } + ], + "setup_path": "Build a tiny LangGraph agent with a checkpointer and explicit memory read/write steps before scoring.", + "runtime_boundary": "Docker-only Python harness with checkpoint store under the artifact directory.", + "resource_expectation": "Small runtime expected, but LLM calls and side effects must be stubbed or deterministic before replay claims.", + "retry_guidance": [ + "Encode one replay/fork failure recovery job.", + "Keep LangGraph classified as replay reference unless memory retrieval is actually exercised." 
+ ], + "research_depth": "D1 feasibility verdict: research_only (XY-882); replay/checkpoint reference, adapter not encoded" + }, + "notes": [] + }, + { + "adapter_id": "nanograph_research_gate", + "project": "nanograph", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "nanograph is D1 reviewed as typed graph DX, but no Docker adapter is implemented." + }, + "run": { + "status": "not_encoded", + "evidence": "No typed graph schema/query real_world_job run is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No graph temporal or retrieval-debug result is claimed." + }, + "capabilities": [ + { + "capability": "typed_graph_schema", + "status": "not_encoded", + "evidence": "Schema-as-code and typed query ergonomics need a benchmark harness." + }, + { + "capability": "memory_backend_comparison", + "status": "unsupported", + "evidence": "nanograph is a graph database reference, not a complete agent memory service." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No nanograph materializer exists." + } + ], + "suites": [ + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "Typed current/historical fact jobs are not encoded." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "Typed query explainability is not scored." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/nanograph/nanograph", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "nanograph repository", + "url": "https://github.com/nanograph/nanograph", + "evidence": "Official source for on-device typed property graph behavior." 
+ } + ], + "setup_path": "Build or install nanograph inside Docker and load a typed graph fixture from generated corpus facts.", + "runtime_boundary": "Docker-only CLI run with graph folder under benchmark artifacts.", + "resource_expectation": "Light local graph runtime expected; record binary build/install time and graph artifact size.", + "retry_guidance": [ + "Define a minimal schema for memory_evolution facts.", + "Score typed query output only if it cites fixture evidence IDs." + ], + "research_depth": "D1 feasibility verdict: research_only (XY-882); typed graph DX reference, adapter not encoded" + }, + "notes": [] + }, + { + "adapter_id": "llm_wiki_research_gate", + "project": "llm-wiki", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "llm-wiki is D1 reviewed as a knowledge-compilation reference, but no plugin or generated-page adapter is implemented." + }, + "run": { + "status": "not_encoded", + "evidence": "No llm-wiki corpus-to-page run is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No knowledge page citation or lint result is claimed." + }, + "capabilities": [ + { + "capability": "knowledge_page_compilation", + "status": "not_encoded", + "evidence": "Wiki generation and citation lint are not executed by the runner." + }, + { + "capability": "live_service_runtime", + "status": "unsupported", + "evidence": "llm-wiki is a plugin/workflow reference rather than a service adapter." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No page materializer or scorer mapping exists." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "Corpus-to-wiki output is not encoded." 
+ }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Resume answers from wiki pages are not encoded." + } + ], + "scenarios": [ + { + "scenario_id": "wiki_page_citation_lint", + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "llm-wiki remains a knowledge-workflow reference. No Docker-contained plugin or file-based page materializer emits cited wiki sections for scoring.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/nvk/llm-wiki", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "llm-wiki repository", + "url": "https://github.com/nvk/llm-wiki", + "evidence": "Official source for the LLM Wiki plugin and knowledge-base workflow." + } + ], + "setup_path": "Research plugin bootstrap inside a Docker-contained Codex or file-based harness, then materialize page artifacts.", + "runtime_boundary": "Docker-only plugin or fixture materializer; no user-global Codex plugin install.", + "resource_expectation": "LLM generation cost depends on page build; record provider boundary and generated artifact size.", + "retry_guidance": [ + "Prototype a fixture-only page build with explicit citations.", + "Do not score until generated sections can be mapped to evidence IDs." 
+ ], + "research_depth": "D1 feasibility verdict: research_only (XY-882); derived wiki workflow reference, adapter not encoded" + }, + "notes": [] + }, + { + "adapter_id": "gbrain_research_gate", + "project": "gbrain", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "gbrain is D1 reviewed as a compiled-truth and timeline reference, but no Docker adapter is implemented." + }, + "run": { + "status": "not_encoded", + "evidence": "No gbrain brain-repo import or compiled-truth run is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No knowledge-synthesis or operator-continuity result is claimed." + }, + "capabilities": [ + { + "capability": "compiled_truth_timeline", + "status": "not_encoded", + "evidence": "Compiled truth plus timeline output is a reference pattern but not scored." + }, + { + "capability": "postgres_backed_brain_repo", + "status": "blocked", + "evidence": "A Docker-local brain repo and Postgres setup path must be proven before execution." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No gbrain materializer exists." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "Compiled truth and timeline pages are not scored." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "Operator continuity through brain pages is not encoded." 
+ } + ], + "scenarios": [ + { + "scenario_id": "compiled_truth_timeline_export", + "suite_id": "knowledge_compilation", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "gbrain compiled-truth and timeline scoring remains blocked until a Docker-local brain repository and database setup emits current-truth pages with source timeline evidence.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/garrytan/gbrain", + "status": "real" + }, + { + "kind": "source", + "ref": "https://github.com/garrytan/gbrain/blob/master/docs/guides/compiled-truth.md", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "gbrain repository", + "url": "https://github.com/garrytan/gbrain", + "evidence": "Official source for brain repo and retrieval workflow." + }, + { + "label": "compiled truth guide", + "url": "https://github.com/garrytan/gbrain/blob/master/docs/guides/compiled-truth.md", + "evidence": "Official guide for compiled truth plus timeline behavior." + } + ], + "setup_path": "Create a Docker-local brain repo fixture, run import/sync, and export compiled truth plus timeline evidence.", + "runtime_boundary": "Docker-only repository and database state with no operator-owned brain repo.", + "resource_expectation": "Postgres-backed sync and embedding choices must be explicit; record DB size and import time.", + "retry_guidance": [ + "Prototype a tiny brain repo with one current-truth page and timeline.", + "Score only if compiled truth cites the source timeline evidence." 
+ ], + "research_depth": "D1 feasibility verdict: blocked (XY-882); Docker-local brain repo and database path not proven" + }, + "notes": [] + }, + { + "adapter_id": "graphify_docker_smoke", + "project": "graphify", + "adapter_kind": "docker_cli_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "XY-900 validation reached the Docker-only graph/report smoke setup inside the baseline runner without host-global assistant hooks.", + "command": "cargo make smoke-graphify-docker-graph-report", + "artifact": "tmp/real-world-memory/graphify-smoke/graphify-smoke.json" + }, + "run": { + "status": "pass", + "evidence": "The smoke installed graphify in a container-local venv, ran over a generated public corpus, and produced graph/report/query output for scoring.", + "command": "cargo make smoke-graphify-docker-graph-report", + "artifact": "tmp/real-world-memory/graphify-smoke/summary.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The smoke emits graphify-report.json and graphify-report.md from one generated knowledge_compilation job. The current scored report maps evidence ids but remains wrong_result because the scoring rubric still records a wrong-result signal.", + "artifact": "tmp/real-world-memory/graphify-smoke/graphify-report.json" + }, + "capabilities": [ + { + "capability": "docker_cli_boundary", + "status": "pass", + "evidence": "The smoke uses docker-compose.baseline.yml baseline-runner, a container-local Python venv, and isolated assistant config paths; it does not install host-global assistant hooks." + }, + { + "capability": "graph_report_generation", + "status": "pass", + "evidence": "The smoke captures graphify-out/graph.json, GRAPH_REPORT.md, cache metadata, command logs, build time, graph size, and report size." 
+ }, + { + "capability": "real_world_job_adapter", + "status": "wrong_result", + "evidence": "The smoke writes a generated real_world_job fixture and scored report; current knowledge_compilation scoring is wrong_result, not pass." + }, + { + "capability": "multimodal_code_graph", + "status": "not_encoded", + "evidence": "Multimodal extraction for videos, images, PDFs, or broad codebase understanding is a reference capability but not scored by this smoke." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The smoke does not claim broad graph quality, private corpus behavior, scale, or authoritative memory-store behavior." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "wrong_result", + "evidence": "The generated smoke exercised graph/report evidence mapping for one generated knowledge-compilation fixture and scored wrong_result with mean_score 0.75." + }, + { + "suite_id": "retrieval", + "status": "blocked", + "evidence": "Graph-guided query output is present only as support for the generated knowledge_compilation smoke; broad retrieval quality scoring remains unclaimed." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Resume answers from graph context are not encoded." + } + ], + "scenarios": [ + { + "scenario_id": "graph_report_navigation_lint", + "suite_id": "knowledge_compilation", + "status": "wrong_result", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-929 adds a representative graphify fixture that scores graph report navigation, source-location citations, stale-source lint, and unsupported-summary handling as wrong_result because stale-source lint is still missing. 
This remains graphify non-pass evidence, not an ELF victory claim.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphify_graph_report_wrong_result.json" + }, + { + "scenario_id": "broad_graph_navigation_quality", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "Broad graph-navigation, codebase, multimodal, and private-corpus quality remain not_tested; the graphify evidence is bounded to generated graph/report artifacts.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/safishamsi/graphify", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make smoke-graphify-docker-graph-report", + "status": "wrong_result" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphify-smoke/graphify-smoke.json", + "status": "pass" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphify-smoke/graphify-report.md", + "status": "wrong_result" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "graphify repository", + "url": "https://github.com/safishamsi/graphify", + "evidence": "Official source for graphify graph extraction and query workflow." + }, + { + "label": "graphify README", + "url": "https://github.com/safishamsi/graphify/blob/v3/README.md", + "evidence": "Official CLI, output artifact, query, and source-location contract." 
+ } + ], + "setup_path": "Run cargo make smoke-graphify-docker-graph-report to install graphify in Docker, build graph/report artifacts from a generated public corpus, and export query evidence without installing host-global assistant hooks.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner, container-local Python venv, isolated HOME/config paths, generated public corpus, and artifacts under tmp/real-world-memory/graphify-smoke.", + "resource_expectation": "Graph build cost scales with corpus and model choices; generated artifacts record package reference, provider/model boundary, build time, graph size, report size, cache size, timeout, and retry behavior.", + "retry_guidance": [ + "Run cargo make smoke-graphify-docker-graph-report first; setup/runtime failures must remain typed artifacts, not pass claims.", + "Do not use graphify host assistant hook installs or operator-owned assistant configuration as proof.", + "Score graph-guided answers only when graph.json, GRAPH_REPORT.md, and graphify query output map to generated evidence ids." + ], + "research_depth": "D1 feasibility verdict plus XY-889 Docker graph/report smoke implementation and XY-900 scored smoke promotion; current Docker validation reaches graphify output and scores the tiny knowledge_compilation job as wrong_result" + }, + "notes": [ + "Status class: live Docker scored smoke with a current wrong_result outcome.", + "Do not interpret graphify-report.json as broad graph-navigation or knowledge-compilation quality evidence; the tiny smoke is scored and currently non-pass." + ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement graphify Docker graph-report adapter", + "reason": "Created as XY-889. XY-882 found a Docker-only CLI/materializer path and source-file/source-location output contract." 
+ } + } + ] + }, + "capture_integration": { + "real": [], + "fixture_backed": [], + "mocked": [], + "blocked": [], + "not_encoded": [ + "No capture/integration behavior was declared by encoded fixtures." + ], + "notes": [] + }, + "summary": { + "job_count": 7, + "encoded_suite_count": 1, + "pass": 5, + "wrong_result": 0, + "lifecycle_fail": 0, + "incomplete": 0, + "blocked": 2, + "not_encoded": 0, + "unsupported_claim": 0, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_total": 18, + "expected_evidence_matched": 18, + "expected_evidence_recall": 1.0, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 0, + "wrong_result_stage_attribution_count": 0, + "mean_score": 0.714, + "mean_latency_ms": 3.192, + "total_cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 386, + "output_tokens": 0 + }, + "evidence_required_count": 18, + "evidence_covered_count": 18, + "evidence_coverage": 1.0, + "source_ref_required_count": 18, + "source_ref_covered_count": 18, + "source_ref_coverage": 1.0, + "quote_required_count": 18, + "quote_covered_count": 18, + "quote_coverage": 1.0, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_correctness": 0.0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case_count": 1, + "qdrant_rebuild_pass_count": 1, + "operator_debug_job_count": 0, + "raw_sql_needed_count": 0, + "trace_incomplete_count": 0, + "operator_ux_gap_count": 0, + "consolidation": { + "proposal_count": 0, + "proposal_usefulness": null, + "lineage_completeness": null, + "review_action_correctness": null, + "source_mutation_count": 0, + "proposal_unsupported_claim_count": 0, + "executable_gap_count": 0 + } + }, + "suites": [ + { + "suite_id": 
"trust_source_of_truth", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "project_decisions", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." 
+ }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "adversarial_quality", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." 
+ }, + { + "suite_id": "consolidation", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "memory_summary", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "proactive_brief", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." 
+ }, + { + "suite_id": "scheduled_memory", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "source_library", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." 
+ }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "capture_integration", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "production_ops", + "status": "blocked", + "encoded_job_count": 7, + "score_mean": 0.714, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": 1.0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 0, + "reason": "At least one encoded job is blocked." 
+ }, + { + "suite_id": "personalization", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "core_archival_memory", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "context_trajectory", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." 
+ } + ], + "jobs": [ + { + "suite_id": "production_ops", + "job_id": "production-ops-restore-cold-start-001", + "title": "Read back restored memory after Docker cold start and Qdrant rebuild", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "restore-search-before", + "claim_id": "restore_recovered_key", + "requirement": "cite" + }, + { + "evidence_id": "restore-qdrant-rebuild", + "claim_id": "qdrant_rebuild_counts", + "requirement": "cite" + }, + { + "evidence_id": "restore-search-after", + "claim_id": "cold_start_readback", + "requirement": "cite" + } + ], + "produced_answer": "The restore proof recovered key single_user_restore_probe after a Docker cold start. Qdrant rebuild returned rebuilt_count=1, missing_vector_count=0, error_count=0, and search after cold start returned one result for the restored key.", + "produced_evidence": [ + "restore-qdrant-rebuild", + "restore-search-after", + "restore-search-before" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 3, + "expected_evidence_matched": 3, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 3, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 2.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": 
"lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 3, + "evidence_covered_count": 3, + "source_ref_required_count": 3, + "source_ref_covered_count": 3, + "quote_required_count": 3, + "quote_covered_count": 3, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": true + }, + { + "suite_id": "production_ops", + "job_id": "production-ops-cold-start-dependency-001", + "title": "Report pinned OpenViking cold-start path reaching behavioral wrong-result", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "pinned-local-embed-runtime-reached", + "claim_id": "pinned_openviking_runtime_reached", + "requirement": "cite" + }, + { + "evidence_id": "pinned-local-embed-retry", + "claim_id": "pinned_openviking_runtime_reached", + "requirement": "cite" + }, + { + "evidence_id": "openviking-wrong-result-behavior", + "claim_id": "openviking_wrong_result_after_runtime", + "requirement": "cite" + }, + { + "evidence_id": "typed-incomplete-policy", + "claim_id": "setup_failure_stays_incomplete", + "requirement": "cite" + } + ], + "produced_answer": "The pinned OpenViking Docker local embedding path reached add_resource/find. OpenViking now reports wrong_result/retrieval_wrong_result because all three smoke queries missed expected evidence terms. 
If the pinned llama-cpp-python install or import fails on another Docker platform, classify that setup boundary as incomplete, not pass.", + "produced_evidence": [ + "openviking-wrong-result-behavior", + "pinned-local-embed-retry", + "pinned-local-embed-runtime-reached", + "typed-incomplete-policy" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 4, + "expected_evidence_matched": 4, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 4, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.8, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "uncertainty_handling", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 4, + "evidence_covered_count": 4, + "source_ref_required_count": 4, + "source_ref_covered_count": 4, + "quote_required_count": 4, + "quote_covered_count": 4, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "production_ops", + "job_id": "production-ops-credential-boundary-001", + "title": "Keep provider credential requirement blocked without committing secrets", + 
"status": "blocked", + "operational_evidence_tier": "provider_backed", + "answer_type": "direct_answer", + "requires_caveat": true, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 0.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "provider-credential-boundary", + "claim_id": "provider_credentials_blocked", + "requirement": "cite" + }, + { + "evidence_id": "checked-in-secret-boundary", + "claim_id": "no_checked_in_secrets", + "requirement": "cite" + } + ], + "produced_answer": "Do not commit or require provider secrets in checked-in fixtures. Provider-backed production-ops checks are blocked until operator credentials are supplied, and reports may record provider metadata but never the API key.", + "produced_evidence": [ + "checked-in-secret-boundary", + "provider-credential-boundary" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.7, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "evidence_grounding", + "score": 0.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "trap_avoidance", + "score": 0.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "uncertainty_handling", + "score": 0.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "workflow_helpfulness", + "score": 0.0, + "max_points": 1.0, + "weight": 0.2 + } + ], + "reason": "Provider-backed production 
operations require operator-owned credentials; checked-in fixtures must not include or require secrets.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "production_ops", + "job_id": "production-ops-backfill-resume-001", + "title": "Resume interrupted generated backfill from checkpoint without duplicate source notes", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "backfill-checkpoint-state", + "claim_id": "resume_checkpoint", + "requirement": "cite" + }, + { + "evidence_id": "backfill-clean-compare", + "claim_id": "clean_compare_matched", + "requirement": "cite" + } + ], + "produced_answer": "Resume from checkpoint offset 1000 to 2000 completed the 2000 document backfill. 
The resumed backfill found zero duplicate source notes, and search quality after resume matched the clean run for all 16 queries.", + "produced_evidence": [ + "backfill-checkpoint-state", + "backfill-clean-compare" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "production_ops", + "job_id": "production-ops-private-manifest-blocked-001", + "title": "Report missing private production manifest as bounded blocked caveat", + "status": "blocked", + "operational_evidence_tier": "private_corpus", + "answer_type": 
"direct_answer", + "requires_caveat": true, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 0.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "private-manifest-guard", + "claim_id": "private_manifest_blocked", + "requirement": "cite" + }, + { + "evidence_id": "private-bounded-failure-policy", + "claim_id": "private_bounded_failure", + "requirement": "cite" + } + ], + "produced_answer": "No private-corpus pass is claimed. The private production corpus path is blocked until an operator supplies a sanitized manifest, and the current evidence is a bounded failure, not a pass.", + "produced_evidence": [ + "private-bounded-failure-policy", + "private-manifest-guard" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.6, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "evidence_grounding", + "score": 0.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "trap_avoidance", + "score": 0.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "uncertainty_handling", + "score": 0.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "workflow_helpfulness", + "score": 0.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "No operator-owned private production corpus manifest is checked in or available to this fixture; no private-corpus pass can be claimed.", + 
"evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "production_ops", + "job_id": "production-ops-public-proxy-addendum-001", + "title": "Separate operator-approved public-proxy evidence from private and provider proof", + "status": "pass", + "operational_evidence_tier": "public_proxy", + "answer_type": "direct_answer", + "requires_caveat": true, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "public-proxy-addendum-pass", + "claim_id": "public_proxy_passed", + "requirement": "cite" + }, + { + "evidence_id": "public-proxy-latency-resource-cost", + "claim_id": "public_proxy_operational_envelope", + "requirement": "cite" + }, + { + "evidence_id": "public-proxy-claim-boundary", + "claim_id": "public_proxy_boundary", + "requirement": "cite" + } + ], + "produced_answer": "The operator-approved public-proxy addendum passed 8/8 query checks with 0 wrong_result. It recorded query mean latency 10.842727625 ms, P95 30.443385 ms, elapsed 1.313984156 seconds, RSS 37656 KB, and 386 estimated input tokens. 
This is public-proxy evidence only: it is not real private-corpus production proof and does not prove provider-backed production quality because embedding mode was local-hash.", + "produced_evidence": [ + "public-proxy-addendum-pass", + "public-proxy-claim-boundary", + "public-proxy-latency-resource-cost" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 3, + "expected_evidence_matched": 3, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 3, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 10.842727625, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 386, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "latency_resource", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 3, + "evidence_covered_count": 3, + "source_ref_required_count": 3, + "source_ref_covered_count": 3, + "quote_required_count": 3, + "quote_covered_count": 3, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "production_ops", + "job_id": "production-ops-resource-envelope-001", + "title": "Report generated backfill resource envelope and operator planning 
caveat", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "resource-envelope-check", + "claim_id": "resource_envelope_passed", + "requirement": "cite" + }, + { + "evidence_id": "large-import-planning-caveat", + "claim_id": "large_import_batch_caveat", + "requirement": "cite" + } + ], + "produced_answer": "The resource envelope passed: 2793.629 seconds was within the 3600-second limit, and 167652 KB RSS was within the 1500000 KB limit. Large imports should be planned as batch jobs, not interactive operations.", + "produced_evidence": [ + "large-import-planning-caveat", + "resource-envelope-check" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 2.3, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "latency_resource", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + 
"evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + } + ], + "unsupported_claims": [], + "not_encoded_suites": [ + "trust_source_of_truth", + "work_resume", + "project_decisions", + "retrieval", + "memory_evolution", + "adversarial_quality", + "consolidation", + "memory_summary", + "proactive_brief", + "scheduled_memory", + "knowledge_compilation", + "source_library", + "operator_debugging_ux", + "capture_integration", + "personalization", + "core_archival_memory", + "context_trajectory" + ], + "private_corpus_redaction": { + "policy": "publish evidence ids and bounded score summaries only; do not publish private text", + "private_fixture_count": 1 + }, + "evolution": { + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0 + }, + "follow_ups": [ + { + "suite_id": "production_ops", + "job_id": "production-ops-credential-boundary-001", + "title": "Run provider-backed production-ops gate with routed operator credentials", + "reason": "Credential-bound checks need an operator shell with provider environment variables; fixture reports can only encode the boundary." + }, + { + "suite_id": "production_ops", + "job_id": "production-ops-private-manifest-blocked-001", + "title": "Supply an operator-owned private production corpus manifest", + "reason": "A real private-corpus pass requires a sanitized local manifest supplied outside checked-in fixtures." 
+ } + ] +} \ No newline at end of file diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index c2a2bd54..c2c08696 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -21,6 +21,7 @@ const REPORT_SCHEMA: &str = "elf.real_world_job_report/v1"; const EXTERNAL_ADAPTER_MANIFEST_SCHEMA: &str = "elf.real_world_external_adapter_manifest/v1"; const EXTERNAL_ADAPTER_REPORT_SCHEMA: &str = "elf.real_world_external_adapter_report/v1"; const SCOREBOARD_SCHEMA: &str = "elf.quality_scoreboard/v1"; +const OPERATIONAL_EVIDENCE_SCHEMA: &str = "elf.operational_evidence_gates/v1"; const DEFAULT_FIXTURE_PATH: &str = "apps/elf-eval/fixtures/real_world_memory/work_resume"; const DEFAULT_REPORT_PATH: &str = "tmp/real-world-job/real-world-job-smoke-report.json"; const DEFAULT_MARKDOWN_PATH: &str = "tmp/real-world-job/real-world-job-smoke-report.md"; @@ -74,6 +75,8 @@ const SCOREBOARD_RESULT_STATES: &[&str] = &[ ]; const SCOREBOARD_EVIDENCE_CLASSES: &[&str] = &["fixture_backed", "live_baseline", "live_real_world", "research_gate"]; +const OPERATIONAL_EVIDENCE_TIERS: &[&str] = + &["local_fixture", "public_proxy", "private_corpus", "provider_backed"]; #[derive(Debug, Parser)] #[command( @@ -831,6 +834,8 @@ struct RealWorldReport { #[serde(default)] scoreboard: ScoreboardReport, #[serde(default)] + operational_evidence: OperationalEvidenceReport, + #[serde(default)] external_adapters: ExternalAdapterSection, capture_integration: CaptureIntegrationReport, summary: ReportSummary, @@ -863,6 +868,84 @@ struct ScoreboardReport { claim_boundary: String, } +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct OperationalEvidenceReport { + schema: String, + #[serde(default)] + tiers: Vec, + latency: OperationalLatencyReport, + cost: OperationalCostSummary, + resource: OperationalResourceSummary, + cold_start_restore_rebuild: 
OperationalColdStartRestoreRebuild, + missing_private_provider_inputs_are_typed_blockers: bool, + private_corpus_pass_claim_allowed: bool, + provider_backed_pass_claim_allowed: bool, + claim_boundary: String, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct OperationalEvidenceTierReport { + tier: String, + status: TypedStatus, + job_count: usize, + pass: usize, + wrong_result: usize, + lifecycle_fail: usize, + incomplete: usize, + blocked: usize, + not_encoded: usize, + unsupported_claim: usize, + mean_latency_ms: Option, + total_cost: Option, + resource_evidence_count: usize, + cold_start_evidence_count: usize, + restore_evidence_count: usize, + qdrant_rebuild_evidence_count: usize, + pass_claim_allowed: bool, + #[serde(default)] + blocker_reasons: Vec, + #[serde(default)] + job_ids: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct OperationalLatencyReport { + measured_job_count: usize, + missing_latency_job_count: usize, + mean_ms: Option, + max_ms: Option, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct OperationalCostSummary { + jobs_with_cost_report: usize, + missing_cost_job_count: usize, + zero_cost_job_count: usize, + total: Option, + claim_boundary: String, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct OperationalResourceSummary { + resource_envelope_job_count: usize, + resource_envelope_pass_count: usize, + latency_resource_dimension_job_count: usize, + #[serde(default)] + job_ids: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct OperationalColdStartRestoreRebuild { + cold_start_job_count: usize, + cold_start_pass_count: usize, + restore_job_count: usize, + restore_pass_count: usize, + qdrant_rebuild_job_count: usize, + qdrant_rebuild_pass_count: usize, + #[serde(default)] + job_ids: Vec, +} + #[derive(Debug, Deserialize, Serialize)] struct AdapterReport { adapter_id: String, @@ -1348,6 +1431,7 @@ struct JobReport { job_id: String, title: 
String, status: TypedStatus, + operational_evidence_tier: String, answer_type: String, requires_caveat: bool, requires_refusal: bool, @@ -3206,6 +3290,7 @@ fn build_report(jobs: &[RealWorldJob], args: &RunArgs) -> Result Result JobReport { job_id: job.job_id.clone(), title: job.title.clone(), status: scoring.status, + operational_evidence_tier: operational_evidence_tier(job).to_string(), answer_type: job.expected_answer.answer_type.clone(), requires_caveat: job.expected_answer.requires_caveat, requires_refusal: job.expected_answer.requires_refusal, @@ -5609,6 +5696,220 @@ fn scoreboard_summary_claim(jobs: &[JobReport], typed_non_pass_count: usize) -> } } +fn operational_evidence_report( + jobs: &[RealWorldJob], + reports: &[JobReport], +) -> OperationalEvidenceReport { + let paired = jobs.iter().zip(reports.iter()).collect::>(); + let tiers = OPERATIONAL_EVIDENCE_TIERS + .iter() + .map(|tier| operational_evidence_tier_report(tier, paired.as_slice())) + .collect::>(); + let private_tier = tiers.iter().find(|tier| tier.tier == "private_corpus"); + let provider_tier = tiers.iter().find(|tier| tier.tier == "provider_backed"); + let private_corpus_pass_claim_allowed = + private_tier.is_some_and(|tier| tier.pass_claim_allowed); + let provider_backed_pass_claim_allowed = + provider_tier.is_some_and(|tier| tier.pass_claim_allowed); + let missing_private_provider_inputs_are_typed_blockers = private_tier + .is_some_and(operational_tier_has_typed_blocker) + && provider_tier.is_some_and(operational_tier_has_typed_blocker); + + OperationalEvidenceReport { + schema: OPERATIONAL_EVIDENCE_SCHEMA.to_string(), + tiers, + latency: operational_latency_report(reports), + cost: operational_cost_summary(reports), + resource: operational_resource_summary(paired.as_slice()), + cold_start_restore_rebuild: operational_cold_start_restore_rebuild(paired.as_slice()), + missing_private_provider_inputs_are_typed_blockers, + private_corpus_pass_claim_allowed, + provider_backed_pass_claim_allowed, 
+ claim_boundary: "Operational evidence tiers are separate: local fixture and public-proxy passes do not prove private-corpus or provider-backed production quality.".to_string(), + } +} + +fn operational_evidence_tier_report( + tier: &str, + paired: &[(&RealWorldJob, &JobReport)], +) -> OperationalEvidenceTierReport { + let tier_jobs = paired + .iter() + .filter(|(job, _)| operational_evidence_tier(job) == tier) + .copied() + .collect::>(); + let reports = tier_jobs.iter().map(|(_, report)| *report).collect::>(); + let status = if reports.is_empty() { + TypedStatus::NotEncoded + } else { + aggregate_status(reports.as_slice()) + }; + let job_count = reports.len(); + let pass = reports.iter().filter(|report| report.status == TypedStatus::Pass).count(); + let wrong_result = + reports.iter().filter(|report| report.status == TypedStatus::WrongResult).count(); + let lifecycle_fail = + reports.iter().filter(|report| report.status == TypedStatus::LifecycleFail).count(); + let incomplete = + reports.iter().filter(|report| report.status == TypedStatus::Incomplete).count(); + let blocked = reports.iter().filter(|report| report.status == TypedStatus::Blocked).count(); + let not_encoded = usize::from(reports.is_empty()) + + reports.iter().filter(|report| report.status == TypedStatus::NotEncoded).count(); + let unsupported_claim = + reports.iter().filter(|report| report.status == TypedStatus::UnsupportedClaim).count(); + + OperationalEvidenceTierReport { + tier: tier.to_string(), + status, + job_count, + pass, + wrong_result, + lifecycle_fail, + incomplete, + blocked, + not_encoded, + unsupported_claim, + mean_latency_ms: mean_latency_for_reports(reports.as_slice()), + total_cost: total_cost_for_reports(reports.as_slice()), + resource_evidence_count: tier_jobs + .iter() + .filter(|(job, _)| job_has_tag(job, "resource_envelope")) + .count(), + cold_start_evidence_count: tier_jobs + .iter() + .filter(|(job, _)| job_has_tag(job, "cold_start")) + .count(), + restore_evidence_count: 
tier_jobs + .iter() + .filter(|(job, _)| job_has_tag(job, "restore")) + .count(), + qdrant_rebuild_evidence_count: tier_jobs + .iter() + .filter(|(job, report)| { + job_has_tag(job, "qdrant_rebuild") || report.qdrant_rebuild_case + }) + .count(), + pass_claim_allowed: job_count > 0 && status == TypedStatus::Pass, + blocker_reasons: reports + .iter() + .filter(|report| report.status != TypedStatus::Pass) + .map(|report| report.reason.clone()) + .collect(), + job_ids: reports.iter().map(|report| report.job_id.clone()).collect(), + } +} + +fn operational_tier_has_typed_blocker(tier: &OperationalEvidenceTierReport) -> bool { + tier.blocked + tier.incomplete + tier.not_encoded > 0 && !tier.pass_claim_allowed +} + +fn operational_latency_report(reports: &[JobReport]) -> OperationalLatencyReport { + let latencies = reports.iter().filter_map(|report| report.latency_ms).collect::>(); + + OperationalLatencyReport { + measured_job_count: latencies.len(), + missing_latency_job_count: reports.len().saturating_sub(latencies.len()), + mean_ms: mean_latency_for_values(latencies.as_slice()), + max_ms: latencies.iter().copied().reduce(f64::max).map(round3), + } +} + +fn operational_cost_summary(reports: &[JobReport]) -> OperationalCostSummary { + let costs = reports.iter().filter_map(|report| report.cost.as_ref()).collect::>(); + let zero_cost_job_count = + costs.iter().filter(|cost| cost.amount.is_some_and(|amount| amount == 0.0)).count(); + + OperationalCostSummary { + jobs_with_cost_report: costs.len(), + missing_cost_job_count: reports.len().saturating_sub(costs.len()), + zero_cost_job_count, + total: total_cost(reports), + claim_boundary: "Fixture and local-provider zero-cost reports are execution-accounting evidence only; they do not prove hosted provider spend.".to_string(), + } +} + +fn operational_resource_summary( + paired: &[(&RealWorldJob, &JobReport)], +) -> OperationalResourceSummary { + let resource_jobs = + paired.iter().filter(|(job, _)| job_has_tag(job, 
"resource_envelope")).collect::>(); + let latency_resource_dimension_job_count = paired + .iter() + .filter(|(_, report)| { + report.dimension_scores.iter().any(|score| score.dimension == "latency_resource") + }) + .count(); + + OperationalResourceSummary { + resource_envelope_job_count: resource_jobs.len(), + resource_envelope_pass_count: resource_jobs + .iter() + .filter(|(_, report)| report.status == TypedStatus::Pass) + .count(), + latency_resource_dimension_job_count, + job_ids: resource_jobs.iter().map(|(_, report)| report.job_id.clone()).collect(), + } +} + +fn operational_cold_start_restore_rebuild( + paired: &[(&RealWorldJob, &JobReport)], +) -> OperationalColdStartRestoreRebuild { + let cold_start_jobs = + paired.iter().filter(|(job, _)| job_has_tag(job, "cold_start")).collect::>(); + let restore_jobs = + paired.iter().filter(|(job, _)| job_has_tag(job, "restore")).collect::>(); + let qdrant_rebuild_jobs = paired + .iter() + .filter(|(job, report)| job_has_tag(job, "qdrant_rebuild") || report.qdrant_rebuild_case) + .collect::>(); + let mut job_ids = cold_start_jobs + .iter() + .chain(restore_jobs.iter()) + .chain(qdrant_rebuild_jobs.iter()) + .map(|(_, report)| report.job_id.clone()) + .collect::>() + .into_iter() + .collect::>(); + + job_ids.sort(); + OperationalColdStartRestoreRebuild { + cold_start_job_count: cold_start_jobs.len(), + cold_start_pass_count: cold_start_jobs + .iter() + .filter(|(_, report)| report.status == TypedStatus::Pass) + .count(), + restore_job_count: restore_jobs.len(), + restore_pass_count: restore_jobs + .iter() + .filter(|(_, report)| report.status == TypedStatus::Pass) + .count(), + qdrant_rebuild_job_count: qdrant_rebuild_jobs.len(), + qdrant_rebuild_pass_count: qdrant_rebuild_jobs + .iter() + .filter(|(_, report)| report.status == TypedStatus::Pass) + .count(), + job_ids, + } +} + +fn operational_evidence_tier(job: &RealWorldJob) -> &'static str { + if job_has_tag(job, "provider_backed") { + "provider_backed" + } else if 
job_has_tag(job, "private_corpus") + || matches!(job.corpus.profile, CorpusProfile::PrivateSanitized) + { + "private_corpus" + } else if job_has_tag(job, "public_proxy") { + "public_proxy" + } else { + "local_fixture" + } +} + +fn job_has_tag(job: &RealWorldJob, tag: &str) -> bool { + job.tags.iter().any(|candidate| candidate == tag) +} + fn evolution_summary(jobs: &[JobReport]) -> EvolutionSummary { EvolutionSummary { stale_answer_count: jobs.iter().map(|job| job.stale_answer_count).sum(), @@ -6062,16 +6363,36 @@ fn mean_score(jobs: &[JobReport]) -> f64 { fn mean_latency(jobs: &[JobReport]) -> Option { let latencies = jobs.iter().filter_map(|job| job.latency_ms).collect::>(); + mean_latency_for_values(latencies.as_slice()) +} + +fn mean_latency_for_reports(jobs: &[&JobReport]) -> Option { + let latencies = jobs.iter().filter_map(|job| job.latency_ms).collect::>(); + + mean_latency_for_values(latencies.as_slice()) +} + +fn mean_latency_for_values(latencies: &[f64]) -> Option { if latencies.is_empty() { - return None; + None + } else { + Some(round3(latencies.iter().sum::() / latencies.len() as f64)) } - - Some(round3(latencies.iter().sum::() / latencies.len() as f64)) } fn total_cost(jobs: &[JobReport]) -> Option { let costs = jobs.iter().filter_map(|job| job.cost.as_ref()).collect::>(); + total_cost_for_values(costs.as_slice()) +} + +fn total_cost_for_reports(jobs: &[&JobReport]) -> Option { + let costs = jobs.iter().filter_map(|job| job.cost.as_ref()).collect::>(); + + total_cost_for_values(costs.as_slice()) +} + +fn total_cost_for_values(costs: &[&CostReport]) -> Option { if costs.is_empty() { return None; } @@ -6700,6 +7021,7 @@ fn render_markdown(report: &RealWorldReport, report_path: &Path) -> String { render_markdown_header(&mut out, report, report_path.as_str()); render_markdown_scoreboard(&mut out, report); + render_markdown_operational_evidence(&mut out, report); render_markdown_external_adapters(&mut out, report); render_markdown_capture_integration(&mut 
out, report); render_markdown_suites(&mut out, report); @@ -6775,6 +7097,99 @@ fn render_markdown_scoreboard(out: &mut String, report: &RealWorldReport) { )); } +fn render_markdown_operational_evidence(out: &mut String, report: &RealWorldReport) { + let evidence = &report.operational_evidence; + + if evidence.schema.is_empty() { + return; + } + + out.push_str("## Operational Evidence Gates\n\n"); + out.push_str("This section separates operational evidence tiers so local fixture or public-proxy passes do not become private-corpus or provider-backed proof.\n\n"); + out.push_str(&format!("- Schema: `{}`\n", md_inline(evidence.schema.as_str()))); + out.push_str(&format!("- Claim boundary: {}\n", md_cell(evidence.claim_boundary.as_str()))); + out.push_str(&format!( + "- Missing private/provider inputs are typed blockers: `{}`\n", + evidence.missing_private_provider_inputs_are_typed_blockers + )); + out.push_str(&format!( + "- Private-corpus pass claim allowed: `{}`\n", + evidence.private_corpus_pass_claim_allowed + )); + out.push_str(&format!( + "- Provider-backed pass claim allowed: `{}`\n", + evidence.provider_backed_pass_claim_allowed + )); + out.push_str(&format!( + "- Latency: `{}` measured job(s), `{}` missing, mean `{}`, max `{}`\n", + evidence.latency.measured_job_count, + evidence.latency.missing_latency_job_count, + optional_f64(evidence.latency.mean_ms, " ms"), + optional_f64(evidence.latency.max_ms, " ms") + )); + out.push_str(&format!( + "- Cost: `{}` job(s) reported cost, `{}` missing, `{}` zero-cost; total `{}`\n", + evidence.cost.jobs_with_cost_report, + evidence.cost.missing_cost_job_count, + evidence.cost.zero_cost_job_count, + cost_display(evidence.cost.total.as_ref()) + )); + out.push_str(&format!("- Cost boundary: {}\n", md_cell(evidence.cost.claim_boundary.as_str()))); + out.push_str(&format!( + "- Resource envelope jobs: `{}` total, `{}` pass; latency/resource dimensions `{}`\n", + evidence.resource.resource_envelope_job_count, + 
evidence.resource.resource_envelope_pass_count, + evidence.resource.latency_resource_dimension_job_count + )); + out.push_str(&format!( + "- Cold-start/restore/rebuild: cold-start `{}`/`{}` pass, restore `{}`/`{}` pass, Qdrant rebuild `{}`/`{}` pass\n\n", + evidence.cold_start_restore_rebuild.cold_start_pass_count, + evidence.cold_start_restore_rebuild.cold_start_job_count, + evidence.cold_start_restore_rebuild.restore_pass_count, + evidence.cold_start_restore_rebuild.restore_job_count, + evidence.cold_start_restore_rebuild.qdrant_rebuild_pass_count, + evidence.cold_start_restore_rebuild.qdrant_rebuild_job_count + )); + out.push_str("| Evidence Tier | Status | Jobs | Pass | Blocked | Incomplete | Not Encoded | Mean Latency | Cost | Resource | Cold Start | Restore | Qdrant Rebuild | Pass Claim |\n"); + out.push_str("| --- | --- | ---: | ---: | ---: | ---: | ---: | --- | --- | ---: | ---: | ---: | ---: | --- |\n"); + + for tier in &evidence.tiers { + out.push_str(&format!( + "| `{}` | `{}` | {} | {} | {} | {} | {} | `{}` | `{}` | {} | {} | {} | {} | `{}` |\n", + md_inline(tier.tier.as_str()), + status_str(tier.status), + tier.job_count, + tier.pass, + tier.blocked, + tier.incomplete, + tier.not_encoded, + optional_f64(tier.mean_latency_ms, " ms"), + cost_display(tier.total_cost.as_ref()), + tier.resource_evidence_count, + tier.cold_start_evidence_count, + tier.restore_evidence_count, + tier.qdrant_rebuild_evidence_count, + tier.pass_claim_allowed + )); + } + + if evidence.tiers.iter().any(|tier| !tier.blocker_reasons.is_empty()) { + out.push_str("\nTyped blocker reasons:\n"); + + for tier in &evidence.tiers { + for reason in &tier.blocker_reasons { + out.push_str(&format!( + "- `{}`: {}\n", + md_inline(tier.tier.as_str()), + md_cell(reason) + )); + } + } + } + + out.push('\n'); +} + fn render_markdown_capture_integration(out: &mut String, report: &RealWorldReport) { out.push_str("## Capture And Integration Coverage\n\n"); diff --git 
a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 6d621005..248c3ba5 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -2907,7 +2907,7 @@ fn assert_live_sweep_record(adapter: &Value, production_ops_status: &str) -> Res fn runner_discovers_nested_fixture_layout() -> Result<()> { let report = run_json_report_from(fixture_root())?; - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(72)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(73)); Ok(()) } @@ -7555,8 +7555,8 @@ fn scheduled_memory_fixture_fails_source_mutation() -> Result<()> { fn production_ops_fixtures_report_bounded_typed_states() -> Result<()> { let report = run_json_report_from(production_ops_fixture_dir())?; - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(6)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(4)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(7)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(5)); assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(2)); assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(0)); @@ -7574,11 +7574,12 @@ fn production_ops_fixtures_report_bounded_typed_states() -> Result<()> { let production_ops = find_by_field(suites, "/suite_id", "production_ops")?; assert_eq!(production_ops.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert_eq!(production_ops.pointer("/encoded_job_count").and_then(Value::as_u64), Some(6)); + assert_eq!(production_ops.pointer("/encoded_job_count").and_then(Value::as_u64), Some(7)); let jobs = array_at(&report, "/jobs")?; let backfill = find_by_field(jobs, "/job_id", 
"production-ops-backfill-resume-001")?; let restore = find_by_field(jobs, "/job_id", "production-ops-restore-cold-start-001")?; + let public_proxy = find_by_field(jobs, "/job_id", "production-ops-public-proxy-addendum-001")?; let private_manifest = find_by_field(jobs, "/job_id", "production-ops-private-manifest-blocked-001")?; let credentials = find_by_field(jobs, "/job_id", "production-ops-credential-boundary-001")?; @@ -7587,9 +7588,79 @@ fn production_ops_fixtures_report_bounded_typed_states() -> Result<()> { assert_eq!(backfill.pointer("/status").and_then(Value::as_str), Some("pass")); assert_eq!(restore.pointer("/status").and_then(Value::as_str), Some("pass")); assert_eq!(restore.pointer("/qdrant_rebuild_case").and_then(Value::as_bool), Some(true)); + assert_eq!(public_proxy.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + public_proxy.pointer("/operational_evidence_tier").and_then(Value::as_str), + Some("public_proxy") + ); assert_eq!(private_manifest.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + private_manifest.pointer("/operational_evidence_tier").and_then(Value::as_str), + Some("private_corpus") + ); assert_eq!(credentials.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + credentials.pointer("/operational_evidence_tier").and_then(Value::as_str), + Some("provider_backed") + ); assert_eq!(dependency.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + report.pointer("/operational_evidence/schema").and_then(Value::as_str), + Some("elf.operational_evidence_gates/v1") + ); + assert_eq!( + report + .pointer("/operational_evidence/missing_private_provider_inputs_are_typed_blockers") + .and_then(Value::as_bool), + Some(true) + ); + assert_eq!( + report + .pointer("/operational_evidence/private_corpus_pass_claim_allowed") + .and_then(Value::as_bool), + Some(false) + ); + assert_eq!( + report + 
.pointer("/operational_evidence/provider_backed_pass_claim_allowed") + .and_then(Value::as_bool), + Some(false) + ); + assert_eq!( + report.pointer("/operational_evidence/latency/measured_job_count").and_then(Value::as_u64), + Some(7) + ); + assert_eq!( + report.pointer("/operational_evidence/cost/jobs_with_cost_report").and_then(Value::as_u64), + Some(7) + ); + assert_eq!( + report + .pointer("/operational_evidence/resource/resource_envelope_job_count") + .and_then(Value::as_u64), + Some(2) + ); + assert_eq!( + report + .pointer("/operational_evidence/cold_start_restore_rebuild/qdrant_rebuild_pass_count") + .and_then(Value::as_u64), + Some(1) + ); + + let tiers = array_at(&report, "/operational_evidence/tiers")?; + let local_fixture = find_by_field(tiers, "/tier", "local_fixture")?; + let public_proxy_tier = find_by_field(tiers, "/tier", "public_proxy")?; + let private_corpus = find_by_field(tiers, "/tier", "private_corpus")?; + let provider_backed = find_by_field(tiers, "/tier", "provider_backed")?; + + assert_eq!(local_fixture.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(local_fixture.pointer("/job_count").and_then(Value::as_u64), Some(4)); + assert_eq!(public_proxy_tier.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(public_proxy_tier.pointer("/job_count").and_then(Value::as_u64), Some(1)); + assert_eq!(private_corpus.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(private_corpus.pointer("/blocked").and_then(Value::as_u64), Some(1)); + assert_eq!(provider_backed.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(provider_backed.pointer("/blocked").and_then(Value::as_u64), Some(1)); Ok(()) } @@ -7819,9 +7890,9 @@ fn assert_root_knowledge_summary(report: &Value) { } fn assert_root_aggregate_summary(report: &Value) -> Result<()> { - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(72)); + 
assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(73)); assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(18)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(65)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(66)); assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(7)); @@ -7864,11 +7935,11 @@ fn assert_root_aggregate_summary(report: &Value) -> Result<()> { ); assert_eq!( report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), - Some(162) + Some(165) ); assert_eq!( report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), - Some(162) + Some(165) ); assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0)); @@ -8098,7 +8169,7 @@ fn assert_root_aggregate_suites(report: &Value) -> Result<()> { let production_ops = find_by_field(suites, "/suite_id", "production_ops")?; assert_eq!(production_ops.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert_eq!(production_ops.pointer("/encoded_job_count").and_then(Value::as_u64), Some(6)); + assert_eq!(production_ops.pointer("/encoded_job_count").and_then(Value::as_u64), Some(7)); let proactive = find_by_field(suites, "/suite_id", "proactive_brief")?; diff --git a/docs/evidence/benchmarking/2026-06-23-p4-production-readiness-evidence-gates-report.md b/docs/evidence/benchmarking/2026-06-23-p4-production-readiness-evidence-gates-report.md new file mode 100644 index 00000000..007fdf98 --- /dev/null +++ b/docs/evidence/benchmarking/2026-06-23-p4-production-readiness-evidence-gates-report.md @@ -0,0 +1,494 @@ +--- +type: Evidence +title: 
"P4 Production-Readiness Evidence Gates Report - June 23, 2026" +description: "Record P4 latency, cost, resource, public-proxy, private-corpus, provider-backed, restore, and rebuild evidence gates." +resource: docs/evidence/benchmarking/2026-06-23-p4-production-readiness-evidence-gates-report.md +status: active +authority: evidence +owner: benchmarking +last_verified: 2026-06-23 +tags: + - docs + - evidence + - benchmarking + - p4-production-readiness +source_refs: + - apps/elf-eval/fixtures/report_snapshots/2026-06-23-p4-production-readiness-evidence-gates-report.json + - apps/elf-eval/fixtures/real_world_memory/production_ops/ +code_refs: + - Makefile.toml + - apps/elf-eval/src/bin/real_world_job_benchmark.rs +related: + - docs/spec/real_world_agent_memory_benchmark_v1.md + - docs/evidence/benchmarking/2026-06-19-operator-approved-public-proxy-production-private-addendum.md + - docs/evidence/benchmarking/2026-06-23-p3-competitor-strength-absorption-report.md +drift_watch: + - docs/evidence/benchmarking/2026-06-23-p4-production-readiness-evidence-gates-report.md + - apps/elf-eval/fixtures/report_snapshots/2026-06-23-p4-production-readiness-evidence-gates-report.json + - docs/evidence/benchmarking/index.md + - README.md +--- +# P4 Production-Readiness Evidence Gates Report - June 23, 2026 + +Goal: Publish a durable P4 production-readiness evidence-gate report. +Read this when: You need the checked-in latency, cost, resource, public-proxy, +private-corpus, provider-backed, restore, and rebuild evidence boundaries for the +production-ops slice. +Inputs: `apps/elf-eval/fixtures/report_snapshots/2026-06-23-p4-production-readiness-evidence-gates-report.json`. +Depends on: `apps/elf-eval/fixtures/`, `docs/spec/real_world_agent_memory_benchmark_v1.md`, and `Makefile.toml`. +Verification: Compare this Markdown summary with the source JSON before committing. 
+ +## Summary + +- Run ID: `real-world-memory-p4-production-readiness` +- Generated at: `2026-06-22T21:44:10.104652Z` +- Runner version: `0.2.0-846880fe650fae351131c433cd45773485c5a383-aarch64-apple-darwin` +- Corpus profile: `mixed` +- Adapter: `fixture_production_ops` (offline_fixture_response) +- Jobs: `7` +- Suites with encoded jobs: `1` +- Suites with `not_encoded` status: `17` +- Status summary: `5` pass, `0` wrong_result, `0` lifecycle_fail, `0` incomplete, `2` blocked, `0` not_encoded, `0` unsupported_claim +- Unsupported claim count: `0` +- Wrong-result count: `0` +- Stale-answer count: `0` +- Conflict detections: `0` +- Update rationales available: `0` +- Temporal validity not encoded: `0` +- History readback encoded: `0` +- Evidence coverage: `18/18` (`1.000`) +- Source-ref coverage: `18/18` (`1.000`) +- Quote coverage: `18/18` (`1.000`) +- Stale retrieval count: `0` +- Scope correctness: `0/0` (`0.000`), violations `0` +- Redaction leak count: `0` +- Qdrant rebuild cases: `1` encoded, `1` pass +- Expected evidence recall: `1.000` (18/18) +- Irrelevant context ratio: `0.000` (0 irrelevant) +- Trace explainability: `0` job(s), `0` wrong-result stage attribution(s) +- Consolidation source mutation count: `0` +- Mean score: `0.714` +- Mean latency: `3.192 ms` +- Cost: `0.000 USD` +- Operator-debug jobs: `0` +- Raw SQL needed: `0` +- Trace-incomplete debug jobs: `0` +- Operator UX gaps: `0` +- Private corpus redaction: `publish evidence ids and bounded score summaries only; do not publish private text` + +## Quality Scoreboard Grammar + +The scoreboard is a claim grammar, not a leaderboard. A report may claim only the statuses and evidence classes represented by its source JSON. 
+ +- Schema: `elf.quality_scoreboard/v1` +- Result states: `pass, wrong_result, incomplete, blocked, not_tested, not_encoded, unsupported_claim` +- Evidence classes: `fixture_backed, live_baseline, live_real_world, research_gate` +- Summary claim: `typed_non_pass_present` +- Job summary claim: `typed_non_pass_present` +- Job typed non-pass rows: `2` (blocked) +- External-adapter typed non-pass rows: `220` (blocked, incomplete, not_encoded, not_tested, wrong_result) +- Typed non-pass rows: `222` (blocked, incomplete, not_encoded, not_tested, wrong_result) +- Evidence class counts: `fixture_backed=1, live_baseline=6, live_real_world=5, research_gate=11` +- Unqualified win claim allowed: `false` +- Claim boundary: Typed non-pass states and non-live evidence classes must remain visible; reports must not collapse them into unqualified wins. + +## Operational Evidence Gates + +This section separates operational evidence tiers so local fixture or public-proxy passes do not become private-corpus or provider-backed proof. + +- Schema: `elf.operational_evidence_gates/v1` +- Claim boundary: Operational evidence tiers are separate: local fixture and public-proxy passes do not prove private-corpus or provider-backed production quality. +- Missing private/provider inputs are typed blockers: `true` +- Private-corpus pass claim allowed: `false` +- Provider-backed pass claim allowed: `false` +- Latency: `7` measured job(s), `0` missing, mean `3.192 ms`, max `10.843 ms` +- Cost: `7` job(s) reported cost, `0` missing, `7` zero-cost; total `0.000 USD` +- Cost boundary: Fixture and local-provider zero-cost reports are execution-accounting evidence only; they do not prove hosted provider spend. 
+- Resource envelope jobs: `2` total, `2` pass; latency/resource dimensions `2` +- Cold-start/restore/rebuild: cold-start `2`/`2` pass, restore `1`/`1` pass, Qdrant rebuild `1`/`1` pass + +| Evidence Tier | Status | Jobs | Pass | Blocked | Incomplete | Not Encoded | Mean Latency | Cost | Resource | Cold Start | Restore | Qdrant Rebuild | Pass Claim | +| --- | --- | ---: | ---: | ---: | ---: | ---: | --- | --- | ---: | ---: | ---: | ---: | --- | +| `local_fixture` | `pass` | 4 | 4 | 0 | 0 | 0 | `2.050 ms` | `0.000 USD` | 1 | 2 | 1 | 1 | `true` | +| `public_proxy` | `pass` | 1 | 1 | 0 | 0 | 0 | `10.843 ms` | `0.000 USD` | 1 | 0 | 0 | 0 | `true` | +| `private_corpus` | `blocked` | 1 | 0 | 1 | 0 | 0 | `1.600 ms` | `0.000 USD` | 0 | 0 | 0 | 0 | `false` | +| `provider_backed` | `blocked` | 1 | 0 | 1 | 0 | 0 | `1.700 ms` | `0.000 USD` | 0 | 0 | 0 | 0 | `false` | + +Typed blocker reasons: +- `private_corpus`: No operator-owned private production corpus manifest is checked in or available to this fixture; no private-corpus pass can be claimed. +- `provider_backed`: Provider-backed production operations require operator-owned credentials; checked-in fixtures must not include or require secrets. + +## External Adapter Coverage + +This section is manifest-backed. It records external adapter coverage and blockers, but it does not convert live-baseline retrieval results into real-world suite wins. 
+ +- Manifest: `real-world-memory-project-adapters-2026-06-11-first-generation-continuity-source-store` +- Docker default: `true` via `docker-compose.baseline.yml`; artifact dir `tmp/live-baseline/` +- Adapter records: `23` total, `16` external project(s), `23` Docker-default, `0` requiring host-global installs +- Evidence classes: `1` fixture-backed, `6` live-baseline-only, `5` live real-world, `11` research-gate +- Overall statuses: `blocked=7, wrong_result=6, lifecycle_fail=1, pass=4, not_encoded=5` +- Capability coverage statuses: `real=8, mocked=1, unsupported=6, blocked=23, wrong_result=10, pass=30, not_encoded=26` +- Real-world suite statuses: `blocked=24, wrong_result=7, pass=27, not_encoded=37` +- Scenario coverage statuses: `unsupported=3, blocked=21, incomplete=5, wrong_result=6, lifecycle_fail=1, pass=23, not_encoded=13` +- ELF scenario positions: `wins=10, ties=11, loses=1, untested=50` +- Scenario comparison outcomes: `win=10, tie=11, loss=1, not_tested=19, blocked=26, non_goal=5` + +| Project | Adapter | Evidence Class | Overall | Setup | Run | Result | Docker | Suites | Evidence | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| ELF | `elf_real_world_memory_fixture` | `fixture_backed` | `blocked` | `pass` | `blocked` | `blocked` | `true` | `trust_source_of_truth`: `pass`
`work_resume`: `pass`
`project_decisions`: `pass`
`retrieval`: `pass`
`memory_evolution`: `pass`
`consolidation`: `pass`
`memory_summary`: `pass`
`proactive_brief`: `blocked`
`scheduled_memory`: `blocked`
`knowledge_compilation`: `pass`
`operator_debugging_ux`: `pass`
`capture_integration`: `pass`
`core_archival_memory`: `pass`
`production_ops`: `blocked`
`personalization`: `pass`
`context_trajectory`: `blocked` | setup: `cargo make real-world-memory`
result: `tmp/real-world-memory/real-world-memory-report.md` | +| ELF | `elf_live_real_world` | `live_real_world` | `wrong_result` | `pass` | `wrong_result` | `wrong_result` | `true` | `trust_source_of_truth`: `pass`
`work_resume`: `pass`
`retrieval`: `pass`
`project_decisions`: `pass`
`memory_evolution`: `wrong_result`
`consolidation`: `pass`
`knowledge_compilation`: `pass`
`operator_debugging_ux`: `pass`
`capture_integration`: `pass`
`production_ops`: `blocked`
`personalization`: `pass`
`core_archival_memory`: `not_encoded`
`context_trajectory`: `blocked` | setup: `cargo make real-world-memory-live-adapters`
result: `tmp/real-world-memory/live-adapters/elf-report.md` | +| qmd | `qmd_live_baseline` | `live_baseline_only` | `pass` | `pass` | `pass` | `pass` | `true` | `retrieval`: `not_encoded`
`memory_evolution`: `not_encoded`
`operator_debugging_ux`: `not_encoded` | setup: `ELF_BASELINE_PROJECTS=qmd cargo make baseline-live-docker`
result: `docs/runbook/benchmarking/live_baseline_benchmark.md` | +| qmd | `qmd_live_real_world` | `live_real_world` | `wrong_result` | `pass` | `wrong_result` | `wrong_result` | `true` | `trust_source_of_truth`: `pass`
`work_resume`: `pass`
`retrieval`: `pass`
`project_decisions`: `pass`
`memory_evolution`: `wrong_result`
`consolidation`: `not_encoded`
`knowledge_compilation`: `not_encoded`
`operator_debugging_ux`: `wrong_result`
`capture_integration`: `not_encoded`
`production_ops`: `blocked`
`personalization`: `pass`
`core_archival_memory`: `not_encoded`
`context_trajectory`: `blocked` | setup: `cargo make real-world-memory-live-adapters`
result: `tmp/real-world-memory/live-adapters/qmd-report.md` | +| ELF | `elf_operator_debug_live` | `live_real_world` | `pass` | `pass` | `pass` | `pass` | `true` | `operator_debugging_ux`: `pass` | setup: `cargo make real-world-job-operator-ux-live-adapters`
result: `tmp/real-world-job/operator-ux-live-adapters/elf-report.md` | +| qmd | `qmd_operator_debug_live` | `live_real_world` | `wrong_result` | `pass` | `wrong_result` | `wrong_result` | `true` | `operator_debugging_ux`: `wrong_result` | setup: `cargo make real-world-job-operator-ux-live-adapters`
result: `tmp/real-world-job/operator-ux-live-adapters/qmd-report.md` | +| agentmemory | `agentmemory_live_baseline` | `live_baseline_only` | `lifecycle_fail` | `pass` | `lifecycle_fail` | `lifecycle_fail` | `true` | `work_resume`: `blocked`
`capture_integration`: `blocked`
`memory_evolution`: `blocked` | setup: `ELF_BASELINE_PROJECTS=agentmemory cargo make baseline-live-docker`
result: `tmp/live-baseline/live-baseline-report.json` | +| mem0/OpenMemory | `mem0_openmemory_live_baseline` | `live_baseline_only` | `pass` | `pass` | `pass` | `pass` | `true` | `memory_evolution`: `not_encoded`
`personalization`: `not_encoded`
`operator_debugging_ux`: `blocked` | setup: `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker`
result: `tmp/live-baseline/live-baseline-report.json` | +| memsearch | `memsearch_live_baseline` | `live_baseline_only` | `pass` | `pass` | `pass` | `pass` | `true` | `trust_source_of_truth`: `not_encoded`
`retrieval`: `not_encoded`
`memory_evolution`: `not_encoded` | setup: `ELF_BASELINE_PROJECTS=memsearch cargo make baseline-live-docker`
result: `tmp/live-baseline/live-baseline-report.json` | +| OpenViking | `openviking_live_baseline` | `live_baseline_only` | `wrong_result` | `pass` | `wrong_result` | `wrong_result` | `true` | `retrieval`: `wrong_result`
`work_resume`: `not_encoded`
`context_trajectory`: `blocked` | setup: `ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker`
result: `docs/runbook/benchmarking/live_baseline_benchmark.md` | +| claude-mem | `claude_mem_live_baseline` | `live_baseline_only` | `wrong_result` | `pass` | `wrong_result` | `wrong_result` | `true` | `work_resume`: `not_encoded`
`operator_debugging_ux`: `blocked`
`capture_integration`: `blocked` | setup: `ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker`
result: `tmp/live-baseline/live-baseline-report.json` | +| qmd | `qmd_deep_profile_gate` | `research_gate` | `not_encoded` | `pass` | `not_encoded` | `not_encoded` | `true` | `retrieval`: `not_encoded`
`operator_debugging_ux`: `not_encoded` | setup: `ELF_BASELINE_PROJECTS=qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker`
result: `docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md` | +| OpenViking | `openviking_deep_profile_gate` | `research_gate` | `blocked` | `pass` | `blocked` | `blocked` | `true` | `retrieval`: `wrong_result`
`context_trajectory`: `blocked`
`operator_debugging_ux`: `not_encoded` | setup: `ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker`
result: `docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md` | +| RAGFlow | `ragflow_research_gate` | `research_gate` | `blocked` | `blocked` | `blocked` | `blocked` | `true` | `retrieval`: `blocked`
`knowledge_compilation`: `not_encoded`
`production_ops`: `blocked` | setup: `cargo make smoke-ragflow-docker`
result: `tmp/real-world-memory/ragflow-smoke/ragflow-report.json` | +| LightRAG | `lightrag_research_gate` | `research_gate` | `blocked` | `blocked` | `blocked` | `blocked` | `true` | `retrieval`: `blocked`
`memory_evolution`: `not_encoded`
`operator_debugging_ux`: `not_encoded` | setup: `cargo make smoke-lightrag-docker-context`
result: `tmp/real-world-memory/lightrag-context/lightrag-report.json` | +| GraphRAG | `graphrag_research_gate` | `research_gate` | `blocked` | `blocked` | `blocked` | `blocked` | `true` | `knowledge_compilation`: `blocked`
`retrieval`: `not_encoded`
`production_ops`: `not_encoded`
`memory_evolution`: `not_encoded` | setup: `cargo make smoke-graphrag-docker`
result: `tmp/real-world-memory/graphrag-smoke/graphrag-report.json` | +| Graphiti/Zep | `graphiti_zep_research_gate` | `research_gate` | `blocked` | `blocked` | `blocked` | `blocked` | `true` | `memory_evolution`: `blocked`
`retrieval`: `not_encoded`
`production_ops`: `not_encoded` | setup: `cargo make smoke-graphiti-zep-docker-temporal`
result: `tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-report.json` | +| Letta | `letta_research_gate` | `research_gate` | `blocked` | `blocked` | `blocked` | `blocked` | `true` | `personalization`: `not_encoded`
`project_decisions`: `blocked`
`work_resume`: `not_encoded`
`core_archival_memory`: `blocked` | setup: `cargo make smoke-letta-core-archive-export-readback`
result: `tmp/real-world-memory/letta-core-archive/report.json` | +| LangGraph | `langgraph_research_gate` | `research_gate` | `not_encoded` | `not_encoded` | `not_encoded` | `not_encoded` | `true` | `production_ops`: `not_encoded`
`work_resume`: `not_encoded` | setup: `LangGraph is D1 reviewed as a replay/checkpoint reference, not a direct memory backend adapter.`
result: `No production-ops or resume suite result is claimed.` | +| nanograph | `nanograph_research_gate` | `research_gate` | `not_encoded` | `not_encoded` | `not_encoded` | `not_encoded` | `true` | `memory_evolution`: `not_encoded`
`retrieval`: `not_encoded` | setup: `nanograph is D1 reviewed as typed graph DX, but no Docker adapter is implemented.`
result: `No graph temporal or retrieval-debug result is claimed.` | +| llm-wiki | `llm_wiki_research_gate` | `research_gate` | `not_encoded` | `not_encoded` | `not_encoded` | `not_encoded` | `true` | `knowledge_compilation`: `not_encoded`
`work_resume`: `not_encoded` | setup: `llm-wiki is D1 reviewed as a knowledge-compilation reference, but no plugin or generated-page adapter is implemented.`
result: `No knowledge page citation or lint result is claimed.` | +| gbrain | `gbrain_research_gate` | `research_gate` | `not_encoded` | `not_encoded` | `not_encoded` | `not_encoded` | `true` | `knowledge_compilation`: `not_encoded`
`operator_debugging_ux`: `not_encoded` | setup: `gbrain is D1 reviewed as a compiled-truth and timeline reference, but no Docker adapter is implemented.`
result: `No knowledge-synthesis or operator-continuity result is claimed.` | +| graphify | `graphify_docker_smoke` | `live_real_world` | `wrong_result` | `pass` | `pass` | `wrong_result` | `true` | `knowledge_compilation`: `wrong_result`
`retrieval`: `blocked`
`work_resume`: `not_encoded` | setup: `cargo make smoke-graphify-docker-graph-report`
result: `tmp/real-world-memory/graphify-smoke/graphify-report.json` | + +### Adapter Capability Details + +| Adapter | Capability | Status | Evidence | +| --- | --- | --- | --- | +| `elf_real_world_memory_fixture` | real_world_job_fixture_scoring | `real` | The runner scores checked-in real_world_job records with expected evidence, traps, and typed status output. | +| `elf_real_world_memory_fixture` | live_external_adapter_execution | `not_encoded` | The ELF fixture response path does not exercise an external memory project runtime. | +| `elf_real_world_memory_fixture` | docker_isolated_baseline | `pass` | ELF live baseline runs execute through docker-compose.baseline.yml for retrieval and lifecycle evidence. | +| `elf_live_real_world` | real_world_job_adapter | `pass` | The adapter executes real_world_job prompts after runtime ingestion and writes generated answer artifacts before scoring. | +| `elf_live_real_world` | service_runtime_execution | `real` | The materializer uses ElfService, Postgres, Qdrant, deterministic providers, worker indexing, and search_raw in Docker. | +| `elf_live_real_world` | targeted_live_pass | `pass` | The answer-retrieval suites from the original representative slice still pass: work_resume, retrieval, and project_decisions. | +| `elf_live_real_world` | full_suite_live_sweep | `wrong_result` | The runner now emits per-job and per-suite live records for all 55 checked-in jobs, including the operator-debug fixture tree, but memory_evolution is wrong_result and production/core/context boundaries remain typed non-pass. | +| `elf_live_real_world` | full_suite_live_pass | `wrong_result` | No full-suite live pass is claimed; generated reports preserve wrong_result, blocked, and not_encoded job outcomes. | +| `elf_live_real_world` | typed_failure_reporting | `pass` | Adapter setup/runtime limitations are materialized as typed jobs with evidence JSON instead of silent claim upgrades. 
| +| `qmd_live_baseline` | same_corpus_retrieval | `pass` | qmd has an encoded Docker same-corpus retrieval adapter. | +| `qmd_live_baseline` | update_delete_cold_start | `pass` | qmd lifecycle smoke checks are encoded in the live-baseline runner. | +| `qmd_live_baseline` | real_world_job_adapter | `not_encoded` | This live_baseline_only record does not execute real_world_job prompts; cite qmd_live_real_world for the full live real-world sweep. | +| `qmd_live_real_world` | real_world_job_adapter | `pass` | qmd executes real_world_job prompts through its local CLI retrieval/query workflow and records generated answer artifacts. | +| `qmd_live_real_world` | local_cli_retrieval | `real` | The adapter uses qmd collection add, update, embed -f, and query --json inside Docker. | +| `qmd_live_real_world` | targeted_live_pass | `pass` | The answer-retrieval suites from the original representative slice still pass: work_resume, retrieval, and project_decisions. | +| `qmd_live_real_world` | full_suite_live_sweep | `wrong_result` | The runner now emits per-job and per-suite live records for all 55 checked-in jobs, including the operator-debug fixture tree, but memory_evolution and operator_debugging_ux are wrong_result while non-qmd product surfaces remain typed not_encoded or blocked. | +| `qmd_live_real_world` | full_suite_live_pass | `wrong_result` | No full-suite live pass is claimed; generated reports preserve wrong_result, blocked, and not_encoded job outcomes. | +| `qmd_live_real_world` | typed_failure_reporting | `pass` | qmd setup/runtime limitations are materialized as typed jobs with command evidence and retry artifacts. | +| `elf_operator_debug_live` | operator_debug_real_world_job_adapter | `pass` | The adapter executes the checked-in operator_debugging_ux jobs through the live service materializer and generated scoring fixtures. 
| +| `elf_operator_debug_live` | trace_hydration_metadata | `pass` | Generated operator_debug records include service trace ids, viewer links, admin trace-bundle URLs, and trace_available=true. | +| `elf_operator_debug_live` | replay_command_metadata | `pass` | Generated operator_debug records include admin trace-bundle curl replay commands; no raw SQL path is required. | +| `elf_operator_debug_live` | candidate_drop_visibility | `pass` | The operator-debug jobs keep dropped-candidate visibility as explicit job-level evidence instead of relying on direct database inspection. | +| `elf_operator_debug_live` | openmemory_or_claude_mem_ui_runner | `not_encoded` | This ELF live slice does not launch OpenMemory or claude-mem UI flows. | +| `qmd_operator_debug_live` | operator_debug_real_world_job_adapter | `pass` | The adapter executes the checked-in operator_debugging_ux jobs through qmd local CLI materialization and generated scoring fixtures. | +| `qmd_operator_debug_live` | local_replay_command_metadata | `pass` | Generated operator_debug records include qmd query replay commands tied to per-job collections. | +| `qmd_operator_debug_live` | trace_hydration_metadata | `wrong_result` | Generated qmd operator_debug records have trace_available=false and no ELF viewer/admin trace bundle because qmd exposes local replay rows rather than service trace hydration. | +| `qmd_operator_debug_live` | candidate_drop_visibility | `wrong_result` | qmd top-k replay output is available, but intermediate candidate-drop stages are not exposed in the generated artifact. | +| `qmd_operator_debug_live` | openmemory_or_claude_mem_ui_runner | `not_encoded` | This qmd live slice does not launch OpenMemory or claude-mem UI flows. | +| `agentmemory_live_baseline` | same_corpus_retrieval | `pass` | The current adapter can run mem::remember and mem::search against the shared corpus. 
| +| `agentmemory_live_baseline` | adapter_storage | `mocked` | The current adapter uses a process-local StateKV Map and in-memory index. | +| `agentmemory_live_baseline` | durable_cold_start | `blocked` | A persistent upstream KV/index path or hosted runtime is needed before cold-start recovery can be fairly scored. | +| `agentmemory_live_baseline` | durable_work_resume_capture_path | `blocked` | XY-925 selects the next local path as a Docker-contained agentmemory session directory with persisted SDK KV store, observation log, and searchable index across a fresh process; the current StateKV Map and in-memory index still block scoring. | +| `agentmemory_live_baseline` | write_policy_hook_capture | `blocked` | Capture/write-policy jobs require live agentmemory hook observations plus persisted write-policy audit evidence. The current adapter does not execute those hooks. | +| `agentmemory_live_baseline` | real_world_job_adapter | `blocked` | XY-925 adds fixture-backed blocked prompt coverage for the required durable path, but no live agentmemory real_world_job adapter executes prompts until the persistent local store exists. | +| `mem0_openmemory_live_baseline` | local_storage | `real` | The adapter targets local FastEmbed, Qdrant path storage, and local history DB paths in Docker. | +| `mem0_openmemory_live_baseline` | same_corpus_retrieval | `pass` | Fresh scoped baseline run live-baseline-20260611122416 reports mem0 retrieval_pass with 3/3 same-corpus retrieval checks. | +| `mem0_openmemory_live_baseline` | local_lifecycle_update_delete_reload | `pass` | The Docker runner exercises public Memory.update, Memory.delete, and a new Memory.from_config over the same local Qdrant/history paths; the fresh scoped run reports those lifecycle checks passing. 
| +| `mem0_openmemory_live_baseline` | preference_correction_history | `pass` | The fresh scoped run reports preference_correction_history as pass: Memory.history preserved explicit ADD and UPDATE records with old and current preference text, and search returned only the current correction. | +| `mem0_openmemory_live_baseline` | entity_scoped_personalization | `pass` | The fresh scoped run reports entity_scoped_personalization as pass: user_id, agent_id, and run_id filters returned the ELF scoped preference and omitted a PubFi scoped preference. | +| `mem0_openmemory_live_baseline` | local_get_all_export_readback | `pass` | The fresh scoped run reports local_get_all_export_readback as pass: Memory.get_all returned the current scoped preference and omitted the other scope. | +| `mem0_openmemory_live_baseline` | deletion_audit_history | `pass` | The fresh scoped run reports delete_history_audit_readback as pass: Memory.history exposed a DELETE event and search suppressed the deleted memory. | +| `mem0_openmemory_live_baseline` | openmemory_ui_readback | `blocked` | XY-931 runs a bounded OpenMemory export-helper setup probe after the mem0 SDK corpus checks. The probe finds the OpenMemory tree, UI package, compose file, and export helper, then records a setup blocker because the export helper requires Docker access to a running OpenMemory container. Local SDK get_all readback is measured separately and must not be reused as UI evidence. | +| `mem0_openmemory_live_baseline` | hosted_managed_memory_claims | `unsupported` | Hosted mem0 Platform behavior and Platform UI export are outside the local OSS Docker adapter and are non-goals for this local evidence record. | +| `mem0_openmemory_live_baseline` | real_world_job_adapter | `not_encoded` | No mem0/OpenMemory adapter currently executes real_world_job prompts and answer scoring. 
| +| `mem0_openmemory_live_baseline` | optional_graph_memory | `not_encoded` | Optional graph memory is not enabled in the default local OSS path and remains an opt-in scenario gate rather than a default pass/fail claim. | +| `memsearch_live_baseline` | canonical_markdown_store | `real` | memsearch is tracked as a Markdown-first source-of-truth reference. | +| `memsearch_live_baseline` | same_corpus_retrieval | `pass` | Fresh comparable baseline run live-baseline-20260611061612 reports memsearch retrieval_pass with 3/3 same-corpus retrieval checks. | +| `memsearch_live_baseline` | reindex_update_delete_reload | `pass` | The runner rewrites auth-memory.md, deletes a second corpus file, reruns memsearch index, and starts fresh memsearch search processes; the fresh scoped run reports update, delete, and cold-start reload passing. | +| `memsearch_live_baseline` | real_world_job_adapter | `not_encoded` | XY-925 adds fixture-backed prompt coverage for the Markdown source-store and retrieval-debug jobs, but no live memsearch runtime adapter executes real_world_job prompts and answer scoring. | +| `memsearch_live_baseline` | markdown_source_store_prompt_jobs | `pass` | The first-generation OSS fixture slice encodes source-of-truth rebuild/reload and retrieval-debug prompts over the canonical Markdown store while preserving the live-baseline-only evidence boundary. | +| `openviking_live_baseline` | local_embed_setup | `pass` | Docker local embedding dependency setup is pinned to llama-cpp-python==0.3.28 from https://abetlen.github.io/llama-cpp-python/whl/cpu and reached import/runtime in the smoke run. | +| `openviking_live_baseline` | same_corpus_retrieval | `wrong_result` | OpenViking add_resource/find returned resources but missed expected evidence-term matches for every smoke query. 
| +| `openviking_live_baseline` | context_trajectory | `blocked` | OpenViking staged/hierarchical retrieval is now encoded as blocked context_trajectory fixtures until same-corpus expected evidence ids match and staged artifacts are materialized. | +| `openviking_live_baseline` | real_world_job_adapter | `not_encoded` | No OpenViking adapter currently executes real_world_job prompts and answer scoring. | +| `claude_mem_live_baseline` | same_corpus_retrieval | `wrong_result` | The current Docker adapter did not prove correct same-corpus retrieval. | +| `claude_mem_live_baseline` | durable_storage | `real` | The runner writes to a Docker-local SQLite file and constructs a new Database plus repository instances for cold-start recovery search. | +| `claude_mem_live_baseline` | repository_lifecycle | `real` | The runner uses MemoryItemsRepository.update, deletes from the repository-owned memory_items table, and relies on repository FTS triggers for update/delete checks. | +| `claude_mem_live_baseline` | repository_progressive_disclosure | `real` | The runner verifies search result to getById detail hydration and listSources source evidence on the durable repository path. | +| `claude_mem_live_baseline` | progressive_disclosure_real_world_job | `pass` | XY-925 adds fixture-backed prompt coverage for the Docker-contained repository progressive-disclosure path: search result to getById detail hydration and listSources evidence on durable SQLite. Hook, timeline, and viewer workflows remain blocked separately. | +| `claude_mem_live_baseline` | retrieval_repair_artifact | `wrong_result` | The same-corpus retrieval smoke remains wrong_result, and XY-925 records a repair prompt that tells operators to rerun ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker before inspecting tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json. 
| +| `claude_mem_live_baseline` | hook_capture_viewer_workflow | `blocked` | The current Docker runner does not launch claude-mem hooks, timeline capture, local viewer readback, or an operator workflow over the same corpus. | +| `qmd_deep_profile_gate` | stress_profile_retrieval_debug | `not_encoded` | The stress command path exists, but this adapter-pack gate has not published a deep qmd profile result. | +| `qmd_deep_profile_gate` | real_world_job_adapter | `not_encoded` | The qmd live real-world sweep covers the current encoded fixture corpus; expanded retrieval-debug strength suites still need their own materialized adapter run. | +| `qmd_deep_profile_gate` | host_global_install_boundary | `unsupported` | Repository-supported qmd benchmark runs must stay inside docker-compose.baseline.yml and must not require host-global installs. | +| `openviking_deep_profile_gate` | docker_local_embed_setup | `pass` | The local embedding setup is pinned and reaches import/runtime in Docker. | +| `openviking_deep_profile_gate` | hierarchical_context_trajectory | `blocked` | Stage trajectory scoring is encoded as blocked until the smoke adapter returns evidence-bearing same-corpus output and selected hierarchy/expansion artifacts. | +| `openviking_deep_profile_gate` | host_global_install_boundary | `unsupported` | The adapter pack must not ask operators to install OpenViking dependencies globally on the host. | +| `ragflow_research_gate` | adapter_candidate_verdict | `not_encoded` | XY-882 completed D1/D2 feasibility research and marks RAGFlow adapter_candidate; no adapter run is encoded. | +| `ragflow_research_gate` | docker_service_setup | `blocked` | The smoke records official Docker setup, image/disk/startup envelope, CPU/GPU mode, vm.max_map_count handling, provider boundaries, and retry behavior. 
| +| `ragflow_research_gate` | real_world_job_adapter | `blocked` | One generated retrieval job is scored from the smoke artifact or typed blocked when resource, service, or local API-key boundaries stop execution. | +| `ragflow_research_gate` | quality_or_scale_claim | `not_encoded` | The scored smoke does not claim broad RAGFlow quality, private corpus behavior, scale, or comparative ranking. | +| `lightrag_research_gate` | docker_service_setup | `blocked` | The opt-in compose profile records explicit LightRAG image, LLM, embedding, rerank, workspace, and Docker volume configuration without host-global installs. | +| `lightrag_research_gate` | retrieved_context_export | `blocked` | The materializer calls /documents/texts, waits on /documents/track_status, and queries /query with only_need_context plus chunk references when the service is reachable. | +| `lightrag_research_gate` | real_world_job_adapter | `blocked` | The LightRAG materializer rewrites generated retrieval fixtures with adapter_response evidence only when source paths or context map to required evidence ids. | +| `lightrag_research_gate` | quality_or_scale_claim | `not_encoded` | The smoke does not score broad graph-RAG quality, private corpora, scale, or comparative ranking claims. | +| `graphrag_research_gate` | indexing_resource_envelope | `blocked` | The smoke bounds the generated public corpus, timeout, GraphRAG package, model configuration, cache size, output size, elapsed time, and observed cache entries. | +| `graphrag_research_gate` | source_citation_mapping | `blocked` | The generated artifact maps GraphRAG documents, text_units, communities, community_reports, entities, and relationships parquet rows back to real_world_job evidence ids when available. | +| `graphrag_research_gate` | real_world_job_adapter | `blocked` | The smoke writes a generated real_world_job fixture and scored report; provider/setup limits remain blocked until live GraphRAG output maps to expected evidence ids. 
| +| `graphrag_research_gate` | quality_or_scale_claim | `not_encoded` | The smoke does not claim broad graph-navigation quality, knowledge-synthesis quality, private corpora, or large-corpus indexing. | +| `graphiti_zep_research_gate` | temporal_graph_memory | `blocked` | The smoke materializes generated current, historical, and rationale facts with validity windows, but the checked-in record stays blocked until a live artifact maps search output. | +| `graphiti_zep_research_gate` | docker_graph_store_setup | `blocked` | The task uses a Docker Compose graphiti-zep profile for FalkorDB and a container-local Python venv; no host-global graph database or hosted Zep service is used. | +| `graphiti_zep_research_gate` | real_world_job_adapter | `blocked` | The generated temporal-validity fixture is scored or typed blocked; live quality evidence requires Graphiti/Zep search output mapped to current and historical evidence ids. | +| `graphiti_zep_research_gate` | quality_or_scale_claim | `not_encoded` | The smoke does not claim broad graph-memory quality, managed Zep service behavior, private-corpus behavior, or large-corpus performance. | +| `letta_research_gate` | core_archival_memory | `blocked` | ELF fixture jobs score core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search; Letta remains blocked until its export maps equivalent source ids. | +| `letta_research_gate` | docker_embedding_configuration | `blocked` | Official Docker setup requires explicit embedding configuration before archival retrieval can be tested. | +| `letta_research_gate` | real_world_job_adapter | `blocked` | A Docker-contained materializer now exists and emits typed blocked evidence by default; live scoring still requires exported Letta core blocks, archival list/search JSON, and source-id mappings. 
| +| `letta_research_gate` | broad_letta_quality_claim | `not_encoded` | The materializer does not score broad Letta product quality, hosted/private state, personalization breadth, or production durability. | +| `langgraph_research_gate` | checkpoint_replay_regression | `not_encoded` | Replay/fork behavior needs an agent graph harness before scoring. | +| `langgraph_research_gate` | standalone_memory_backend | `unsupported` | LangGraph persistence is an agent-state/checkpoint layer, not a drop-in memory retrieval backend. | +| `langgraph_research_gate` | real_world_job_adapter | `not_encoded` | No LangGraph benchmark materializer exists. | +| `nanograph_research_gate` | typed_graph_schema | `not_encoded` | Schema-as-code and typed query ergonomics need a benchmark harness. | +| `nanograph_research_gate` | memory_backend_comparison | `unsupported` | nanograph is a graph database reference, not a complete agent memory service. | +| `nanograph_research_gate` | real_world_job_adapter | `not_encoded` | No nanograph materializer exists. | +| `llm_wiki_research_gate` | knowledge_page_compilation | `not_encoded` | Wiki generation and citation lint are not executed by the runner. | +| `llm_wiki_research_gate` | live_service_runtime | `unsupported` | llm-wiki is a plugin/workflow reference rather than a service adapter. | +| `llm_wiki_research_gate` | real_world_job_adapter | `not_encoded` | No page materializer or scorer mapping exists. | +| `gbrain_research_gate` | compiled_truth_timeline | `not_encoded` | Compiled truth plus timeline output is a reference pattern but not scored. | +| `gbrain_research_gate` | postgres_backed_brain_repo | `blocked` | A Docker-local brain repo and Postgres setup path must be proven before execution. | +| `gbrain_research_gate` | real_world_job_adapter | `not_encoded` | No gbrain materializer exists. 
| +| `graphify_docker_smoke` | docker_cli_boundary | `pass` | The smoke uses docker-compose.baseline.yml baseline-runner, a container-local Python venv, and isolated assistant config paths; it does not install host-global assistant hooks. | +| `graphify_docker_smoke` | graph_report_generation | `pass` | The smoke captures graphify-out/graph.json, GRAPH_REPORT.md, cache metadata, command logs, build time, graph size, and report size. | +| `graphify_docker_smoke` | real_world_job_adapter | `wrong_result` | The smoke writes a generated real_world_job fixture and scored report; current knowledge_compilation scoring is wrong_result, not pass. | +| `graphify_docker_smoke` | multimodal_code_graph | `not_encoded` | Multimodal extraction for videos, images, PDFs, or broad codebase understanding is a reference capability but not scored by this smoke. | +| `graphify_docker_smoke` | quality_or_scale_claim | `not_encoded` | The smoke does not claim broad graph quality, private corpus behavior, scale, or authoritative memory-store behavior. | + +### Adapter Scenario Judgments + +| Adapter | Scenario | Suite | Status | Outcome | Evidence | +| --- | --- | --- | --- | --- | --- | +| `elf_live_real_world` | `live_capture_write_policy` | `capture_integration` | `pass` | `tie` | ELF live capture/write-policy jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage. This is an ELF self-check, not a win over external hook systems.
command: `cargo make real-world-memory-live-adapters`
artifact: `tmp/real-world-memory/live-adapters/elf-materialization.json` | +| `elf_live_real_world` | `live_consolidation_proposal_review` | `consolidation` | `pass` | `tie` | ELF live consolidation jobs now exercise source lineage, unsupported-claim flags, and apply/defer/discard review audit transitions. This is an ELF service self-check, not a broad competitor win.
command: `cargo make real-world-memory-live-adapters`
artifact: `tmp/real-world-memory/live-adapters/elf-materialization.json` | +| `elf_live_real_world` | `live_knowledge_page_rebuild_lint` | `knowledge_compilation` | `pass` | `tie` | ELF live knowledge jobs now exercise page rebuild, search, stale-source lint, citations, backlinks, and unsupported-section handling. This is an ELF service self-check, not a broad knowledge-product win.
command: `cargo make real-world-memory-live-adapters`
artifact: `tmp/real-world-memory/live-adapters/elf-materialization.json` | +| `elf_live_real_world` | `full_sweep_operator_debug` | `operator_debugging_ux` | `pass` | `win` | ELF full live sweep now includes the operator-debug fixture tree with hydrated trace ids, trace-bundle replay commands, dropped-candidate visibility, repair guidance, and no raw SQL requirement.
command: `cargo make real-world-memory-live-adapters`
artifact: `tmp/real-world-memory/live-adapters/elf-materialization.json` | +| `elf_operator_debug_live` | `operator_debug_trace_hydration` | `operator_debugging_ux` | `pass` | `win` | ELF generated trace_available=true, service trace ids, viewer URLs, and admin trace-bundle replay URLs for the operator-debug jobs; qmd has replay rows but no ELF trace hydration surface.
command: `cargo make real-world-job-operator-ux-live-adapters`
artifact: `tmp/real-world-job/operator-ux-live-adapters/elf-report.json` | +| `elf_operator_debug_live` | `operator_debug_replay_command` | `operator_debugging_ux` | `pass` | `tie` | ELF generated admin trace-bundle replay commands; qmd generated local CLI query replay commands. These are comparable replay-command availability artifacts, not equivalent UI quality claims.
command: `cargo make real-world-job-operator-ux-live-adapters`
artifact: `tmp/real-world-job/operator-ux-live-adapters/summary.json` | +| `elf_operator_debug_live` | `operator_debug_candidate_drop_visibility` | `operator_debugging_ux` | `pass` | `win` | ELF generated operator_debug candidate-drop visibility from trace and replay-candidate metadata without direct SQL assumptions; qmd keeps only top-k replay rows and lacks intermediate candidate-drop stages.
command: `cargo make real-world-job-operator-ux-live-adapters`
artifact: `tmp/real-world-job/operator-ux-live-adapters/elf-materialization.json` | +| `elf_operator_debug_live` | `operator_debug_repair_action_clarity` | `operator_debugging_ux` | `pass` | `tie` | ELF and qmd generated clear repair/replay steps for the narrow operator-debug jobs; OpenMemory UI/export remains blocked, and claude-mem UI repair paths remain blocked until Docker-contained hook/viewer evidence exists.
command: `cargo make real-world-job-operator-ux-live-adapters`
artifact: `tmp/real-world-job/operator-ux-live-adapters/summary.json` | +| `elf_operator_debug_live` | `operator_debug_selected_but_not_narrated` | `operator_debugging_ux` | `pass` | `win` | The new selected-but-not-narrated job scores whether selected trace evidence is available for answer-composition repair without direct database inspection.
command: `cargo make real-world-job-operator-ux-live-adapters`
artifact: `tmp/real-world-job/operator-ux-live-adapters/elf-report.json` | +| `qmd_operator_debug_live` | `operator_debug_trace_hydration` | `operator_debugging_ux` | `wrong_result` | `win` | qmd generated replay-command metadata but trace_available=false, so ELF wins only this trace-hydration dimension; this is not a broad qmd loss.
command: `cargo make real-world-job-operator-ux-live-adapters`
artifact: `tmp/real-world-job/operator-ux-live-adapters/qmd-report.json` | +| `qmd_operator_debug_live` | `operator_debug_replay_command` | `operator_debugging_ux` | `pass` | `tie` | qmd generated local CLI query replay commands for the same operator-debugging scenarios; ELF generated admin trace-bundle curl commands.
command: `cargo make real-world-job-operator-ux-live-adapters`
artifact: `tmp/real-world-job/operator-ux-live-adapters/summary.json` | +| `qmd_operator_debug_live` | `operator_debug_candidate_drop_visibility` | `operator_debugging_ux` | `wrong_result` | `win` | qmd generated top-k replay output but not intermediate retrieved-but-dropped stage visibility, so candidate-drop diagnosis remains a qmd wrong_result in this narrow slice.
command: `cargo make real-world-job-operator-ux-live-adapters`
artifact: `tmp/real-world-job/operator-ux-live-adapters/qmd-materialization.json` | +| `qmd_operator_debug_live` | `operator_debug_repair_action_clarity` | `operator_debugging_ux` | `pass` | `tie` | qmd generated clear local replay steps for repair investigation, matching ELF on repair-action clarity while differing on trace hydration.
command: `cargo make real-world-job-operator-ux-live-adapters`
artifact: `tmp/real-world-job/operator-ux-live-adapters/qmd-report.json` | +| `qmd_operator_debug_live` | `operator_debug_selected_but_not_narrated` | `operator_debugging_ux` | `wrong_result` | `win` | qmd can replay top-k rows, but the generated artifact does not expose service trace narration stages for the selected-but-not-narrated diagnosis.
command: `cargo make real-world-job-operator-ux-live-adapters`
artifact: `tmp/real-world-job/operator-ux-live-adapters/qmd-report.json` | +| `agentmemory_live_baseline` | `basic_same_corpus_retrieval` | `retrieval` | `pass` | `not_tested` | Fresh comparable baseline run live-baseline-20260611061612 reports agentmemory retrieval_pass with 3/3 same-corpus retrieval checks through mem::remember and mem::search. This is live-baseline-only evidence through an in-memory mock, not a real_world_job suite pass.
command: `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`
artifact: `tmp/live-baseline/live-baseline-report.json` | +| `agentmemory_live_baseline` | `durable_update_reload_lifecycle` | `memory_evolution` | `lifecycle_fail` | `win` | Fresh comparable baseline run live-baseline-20260611061612 reports ELF passing 8/8 local lifecycle checks, while agentmemory update_replaces_note_text is lifecycle_fail and cold_start_recovery_search is blocked because the harness uses an in-memory SDK/KV mock. This is an ELF baseline win only at the local lifecycle-smoke evidence class.
command: `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`
artifact: `tmp/live-baseline/live-baseline-report.json` | +| `agentmemory_live_baseline` | `work_resume_capture_continuity` | `work_resume` | `blocked` | `blocked` | agentmemory's relevant strength is durable coding-agent continuity and capture, but the Docker harness has not proven a persistent session/capture path. XY-925 selects the durable local path as a Docker-contained session directory that persists the SDK KV store and searchable index across a fresh process; keep work_resume and capture claims blocked until that path exists.
command: `cargo make real-world-first-generation-oss`
artifact: `tmp/real-world-memory/first-generation-oss/report.json` | +| `agentmemory_live_baseline` | `durable_work_resume_local_path` | `work_resume` | `blocked` | `blocked` | The selected comparable path is explicit: capture into a Docker-local agentmemory session directory, persist the SDK KV/index and observation log, restart a fresh process, then score work_resume prompts. The checked-in fixture records this as blocked rather than scoring the current mock.
command: `cargo make real-world-first-generation-oss`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json` | +| `agentmemory_live_baseline` | `capture_write_policy_hooks` | `capture_integration` | `blocked` | `blocked` | agentmemory capture/write-policy comparison needs live hook observations and write-policy audit evidence persisted through the selected local store. The fixture preserves this as a typed blocker and does not convert the mem::remember smoke into capture proof.
command: `cargo make real-world-first-generation-oss`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json` | +| `mem0_openmemory_live_baseline` | `basic_local_lifecycle` | `memory_evolution` | `pass` | `tie` | Prior comparable baseline run live-baseline-20260611061612 reports ELF passing 8/8 local lifecycle checks and mem0 passing basic same-corpus retrieval, update, delete, and cold-start reload checks. This remains a basic local lifecycle tie at the encoded smoke surface and is not reused as history/UI evidence.
command: `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`
artifact: `tmp/live-baseline/live-baseline-report.json` | +| `mem0_openmemory_live_baseline` | `preference_correction_history` | `personalization` | `pass` | `loss` | Fresh scoped baseline run live-baseline-20260611122416 reports mem0 preference_correction_history as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF live memory-evolution preference as wrong_result. The current measured comparison is therefore an ELF loss on this history dimension until ELF temporal reconciliation is fixed.
command: `mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters`
artifact: `mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md` | +| `mem0_openmemory_live_baseline` | `entity_scoped_personalization` | `personalization` | `pass` | `tie` | Fresh scoped baseline run live-baseline-20260611122416 reports mem0 entity_scoped_personalization as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md, which records ELF and qmd passing the encoded personalization slice. This is a measured tie on the current scoped-preference surface.
command: `mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters`
artifact: `mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md` | +| `mem0_openmemory_live_baseline` | `delete_audit_readback` | `memory_evolution` | `pass` | `tie` | Fresh scoped baseline run live-baseline-20260611122416 reports mem0 delete_history_audit_readback as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF passing the delete/TTL tombstone job. The current measured delete-audit comparison is a tie.
command: `mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters`
artifact: `mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md` | +| `mem0_openmemory_live_baseline` | `local_get_all_export_readback` | `operator_debugging_ux` | `pass` | `not_tested` | Fresh scoped baseline run live-baseline-20260611122416 reports mem0 local_get_all_export_readback as pass. This is local SDK inspection/export-style readback, not OpenMemory UI evidence; ELF has no directly comparable live UI/export scoring row in this run.
command: `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker`
artifact: `tmp/live-baseline/mem0-checks.json` | +| `mem0_openmemory_live_baseline` | `openmemory_ui_export_readback` | `operator_debugging_ux` | `blocked` | `blocked` | The XY-931 OpenMemory export-helper setup probe is Docker-contained in the mem0 baseline run. It detects the OpenMemory product tree, UI package, compose file, and export helper, but Docker is unavailable inside the baseline-runner container before the helper can reach a running OpenMemory product container or app database. Basic lifecycle and local SDK get_all readback are not reused as UI/export proof.
command: `cargo make openmemory-ui-export-readback`
artifact: `tmp/live-baseline/mem0-openmemory-ui-export.json` | +| `mem0_openmemory_live_baseline` | `hosted_platform_export` | `operator_debugging_ux` | `unsupported` | `non_goal` | Hosted mem0 Platform export is explicitly outside the local OSS Docker comparison and is not counted as a local pass, loss, or blocker.
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `mem0_openmemory_live_baseline` | `optional_graph_memory` | `memory_evolution` | `not_encoded` | `non_goal` | Optional graph memory is kept as an opt-in scenario gate. It is not enabled in the default mem0 local OSS run and is not part of the default pass/fail comparison.
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `memsearch_live_baseline` | `canonical_markdown_reindex_reload` | `trust_source_of_truth` | `pass` | `not_tested` | Fresh comparable baseline run live-baseline-20260611061612 reports memsearch passed same-corpus retrieval, update reindex, delete suppression, and cold-start reload over a canonical Markdown corpus. ELF has no directly comparable canonical Markdown source-store scenario in this baseline, so the ELF position remains untested.
command: `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`
artifact: `tmp/live-baseline/live-baseline-report.json` | +| `memsearch_live_baseline` | `markdown_source_store_rebuild_reload_prompt` | `trust_source_of_truth` | `pass` | `not_tested` | XY-925 adds a checked-in real_world_job prompt fixture that asks for the memsearch source-of-truth path and rebuild/reload boundary: canonical Markdown files are authoritative, while the index is derived by rerunning memsearch index. This is fixture-backed scenario coverage plus baseline artifact evidence, not a memsearch live real_world_job suite pass.
command: `cargo make real-world-first-generation-oss`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_markdown_rebuild_reload.json` | +| `memsearch_live_baseline` | `markdown_retrieval_debug_prompt` | `operator_debugging_ux` | `pass` | `not_tested` | XY-925 adds a checked-in retrieval-debug prompt over memsearch's canonical Markdown store. The expected debug surface is CLI replay plus Markdown source inspection and reindexing; staged expansion/fusion/rerank/candidate-drop trace bundles remain not encoded for memsearch.
command: `cargo make real-world-first-generation-oss`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_retrieval_debug_prompt.json` | +| `memsearch_live_baseline` | `ttl_expiry_lifecycle` | `memory_evolution` | `unsupported` | `non_goal` | The encoded memsearch CLI path supports reindex/delete but no TTL or expiry behavior. Unsupported TTL behavior is preserved as unsupported competitor evidence and does not create an ELF win/loss claim without a directly comparable scenario artifact.
artifact: `tmp/live-baseline/live-baseline-report.json` | +| `memsearch_live_baseline` | `real_world_prompt_adapter` | `retrieval` | `not_encoded` | `not_tested` | No live memsearch runtime adapter currently executes real_world_job prompts and answer scoring. XY-925 fixture-backed prompt jobs document the source-store and retrieval-debug shape, while baseline retrieval/reindex evidence remains separate from suite pass claims.
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `claude_mem_live_baseline` | `same_corpus_retrieval` | `retrieval` | `wrong_result` | `win` | Fresh comparable baseline run live-baseline-20260611061612 reports ELF retrieval_pass and claude-mem same_corpus_retrieval as wrong_result with 0/3 expected query checks passing, while its durable repository setup completed. This is an ELF baseline win for the narrow retrieval smoke scenario.
command: `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`
artifact: `tmp/live-baseline/live-baseline-report.json` | +| `claude_mem_live_baseline` | `retrieval_repair_artifact_path` | `retrieval` | `wrong_result` | `win` | XY-925 adds a checked-in repair prompt that preserves the claude-mem wrong_result and names rerun/inspection targets from the reproducible Docker baseline: tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json. This is repair evidence for a miss, not a retrieval pass.
command: `cargo make real-world-first-generation-oss`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_retrieval_repair.json` | +| `claude_mem_live_baseline` | `repository_lifecycle_reload` | `memory_evolution` | `pass` | `tie` | Fresh comparable baseline run live-baseline-20260611061612 reports ELF passing local lifecycle checks and claude-mem update, delete, and cold-start reload checks passing over a durable Docker-local SQLite repository. This is a local lifecycle-smoke tie, not a hook-driven work-resume or full progressive-disclosure job pass.
command: `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`
artifact: `tmp/live-baseline/live-baseline-report.json` | +| `claude_mem_live_baseline` | `progressive_disclosure_detail_hydration` | `operator_debugging_ux` | `pass` | `not_tested` | claude-mem passed the repository-level search-to-detail/source hydration check, which is a useful progressive-disclosure signal. ELF does not have a directly comparable claude-mem-style progressive-disclosure scenario in this baseline, so the ELF position remains untested rather than a loss claim.
command: `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`
artifact: `tmp/live-baseline/live-baseline-report.json` | +| `claude_mem_live_baseline` | `progressive_disclosure_prompt` | `operator_debugging_ux` | `pass` | `not_tested` | XY-925 adds fixture-backed prompt coverage that asks for the measured claude-mem progressive-disclosure boundary: repository search results hydrate through getById and listSources on durable SQLite, but hooks, timeline, viewer, and live prompt scoring are not executed.
command: `cargo make real-world-first-generation-oss`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_progressive_disclosure.json` | +| `claude_mem_live_baseline` | `hook_capture_viewer_workflow` | `capture_integration` | `blocked` | `blocked` | The Docker baseline uses repository classes only. claude-mem hooks, viewer, timeline, and observation workflows are not executed by the runner, so XY-925 preserves this as a typed blocker rather than not_encoded prose.
command: `cargo make real-world-first-generation-oss`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json` | +| `claude_mem_live_baseline` | `viewer_operator_workflow` | `operator_debugging_ux` | `blocked` | `blocked` | A fair claude-mem viewer/operator comparison needs a Docker-contained run that opens the local viewer or equivalent readback over the same durable SQLite corpus and emits timeline, detail hydration, and repair-command artifacts. That path is not available in the current runner.
command: `cargo make real-world-first-generation-oss`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json` | +| `ragflow_research_gate` | `reference_chunk_citation_mapping` | `retrieval` | `blocked` | `blocked` | XY-929 adds a representative blocked fixture for RAGFlow reference-chunk citation scoring. The job must remain blocked until returned reference chunks include generated document ids, chunk ids, content, and document metadata mapped to benchmark evidence ids.
command: `cargo make real-world-memory-graph-rag`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/ragflow_reference_chunks_blocked.json` | +| `ragflow_research_gate` | `retrieval_quality_reference_recall` | `retrieval` | `blocked` | `blocked` | XY-1071 keeps RAGFlow retrieval quality blocked until the same generated corpus returns answer text and selected reference chunks whose document ids, chunk ids, content, and metadata map to expected evidence ids; setup or API reachability alone is not retrieval quality evidence.
command: `cargo make real-world-memory-graph-rag`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/ragflow_reference_chunks_blocked.json` | +| `ragflow_research_gate` | `navigation_quality_document_chunks` | `retrieval` | `blocked` | `blocked` | RAGFlow document/chunk navigation remains blocked until returned references expose stable document metadata plus chunk identifiers that can be followed back to same-corpus source evidence.
command: `cargo make real-world-memory-graph-rag`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/ragflow_reference_chunks_blocked.json` | +| `ragflow_research_gate` | `answer_faithfulness_reference_chunks` | `retrieval` | `blocked` | `blocked` | RAGFlow answer faithfulness is blocked until generated answers can be checked against returned reference chunk content and decoy/stale chunks are absent from cited support.
command: `cargo make real-world-memory-graph-rag`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/ragflow_reference_chunks_blocked.json` | +| `ragflow_research_gate` | `stale_source_behavior` | `retrieval` | `not_encoded` | `not_tested` | RAGFlow stale-source replacement, invalidation, or lint behavior is not encoded by the current same-corpus reference-chunk blocker; no stale-source quality claim is made.
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `ragflow_research_gate` | `knowledge_compilation_quality` | `knowledge_compilation` | `not_encoded` | `not_tested` | RAGFlow knowledge compilation quality is not scored because no checked-in same-corpus RAGFlow page, section, citation, or stale-source lint artifact exists.
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `ragflow_research_gate` | `private_or_large_corpus_ragflow_quality` | `retrieval` | `not_encoded` | `non_goal` | Private corpus, large-corpus, and hosted RAGFlow quality are outside the generated-public Docker representative lane and must not be inferred from smoke reports.
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `lightrag_research_gate` | `context_source_reference_mapping` | `retrieval` | `incomplete` | `blocked` | XY-929 adds a representative incomplete fixture for LightRAG context/source-reference scoring. The job cannot score until the opt-in Docker API exports generated source file paths, snippets, or reference content.
command: `cargo make real-world-memory-graph-rag`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/lightrag_context_sources_incomplete.json` | +| `lightrag_research_gate` | `retrieval_quality_context_recall` | `retrieval` | `incomplete` | `blocked` | XY-1071 keeps LightRAG retrieval quality incomplete until the opt-in Docker API exports same-corpus context or references that can be joined to expected evidence ids; service startup alone is not a retrieval-quality result.
command: `cargo make real-world-memory-graph-rag`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/lightrag_context_sources_incomplete.json` | +| `lightrag_research_gate` | `citation_quality_context_references` | `retrieval` | `incomplete` | `blocked` | LightRAG citation quality is incomplete until returned context, references.file_path, references.content, or equivalent source snippets map to generated evidence ids.
command: `cargo make real-world-memory-graph-rag`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/lightrag_context_sources_incomplete.json` | +| `lightrag_research_gate` | `navigation_quality_graph_context` | `retrieval` | `incomplete` | `blocked` | LightRAG graph/context navigation remains incomplete until exported context exposes source paths or graph-derived source snippets that can be followed back to same-corpus evidence.
command: `cargo make real-world-memory-graph-rag`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/lightrag_context_sources_incomplete.json` | +| `lightrag_research_gate` | `answer_faithfulness_context_refs` | `retrieval` | `incomplete` | `blocked` | LightRAG answer faithfulness is incomplete until generated answers and only_need_context output can be checked for required evidence, decoy exclusion, and source-reference alignment.
command: `cargo make real-world-memory-graph-rag`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/lightrag_context_sources_incomplete.json` | +| `lightrag_research_gate` | `stale_source_behavior` | `retrieval` | `not_encoded` | `not_tested` | LightRAG stale-source replacement, invalidation, or lint behavior is not encoded by the current context-source blocker; no stale-source quality claim is made.
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `lightrag_research_gate` | `knowledge_compilation_quality` | `knowledge_compilation` | `not_encoded` | `not_tested` | LightRAG knowledge compilation quality is not scored because no checked-in same-corpus page, section, citation, or stale-source lint artifact exists.
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `lightrag_research_gate` | `graph_rag_navigation_quality` | `retrieval` | `not_encoded` | `not_tested` | LightRAG graph-RAG navigation quality remains not_tested beyond the context-source output contract; no ELF win, tie, or loss is claimed.
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `graphrag_research_gate` | `output_table_citation_mapping` | `knowledge_compilation` | `blocked` | `blocked` | XY-929 adds a representative blocked fixture for GraphRAG output-table citation scoring. The job requires provider-backed Docker output tables whose document, text-unit, community, report, entity, and relationship identifiers map to generated evidence ids.
command: `cargo make real-world-memory-graph-rag`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphrag_output_tables_blocked.json` | +| `graphrag_research_gate` | `retrieval_quality_local_search` | `retrieval` | `not_encoded` | `not_tested` | XY-1071 keeps GraphRAG retrieval quality not tested because the current smoke records output-table and local-search reachability contracts but does not score same-corpus retrieval answers beyond mapped output prerequisites.
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `graphrag_research_gate` | `navigation_quality_community_graph` | `knowledge_compilation` | `blocked` | `blocked` | GraphRAG community/entity/relationship navigation remains blocked until provider-backed output tables expose community, entity, relationship, text-unit, and document identifiers that map to generated evidence ids.
command: `cargo make real-world-memory-graph-rag`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphrag_output_tables_blocked.json` | +| `graphrag_research_gate` | `answer_faithfulness_output_tables` | `knowledge_compilation` | `blocked` | `blocked` | GraphRAG answer faithfulness is blocked until summaries or local-search answers can be checked against mapped documents, text units, and community report rows while excluding unsupported or stale claims.
command: `cargo make real-world-memory-graph-rag`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphrag_output_tables_blocked.json` | +| `graphrag_research_gate` | `stale_source_behavior` | `knowledge_compilation` | `not_encoded` | `not_tested` | GraphRAG stale-source replacement, invalidation, or lint behavior is not encoded by the current output-table blocker; no stale-source quality claim is made.
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `graphrag_research_gate` | `graph_summary_synthesis_quality` | `knowledge_compilation` | `not_encoded` | `not_tested` | GraphRAG graph-summary synthesis quality remains not_tested until provider-backed output tables and local-search context are scored beyond the smoke contract.
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `graphiti_zep_research_gate` | `temporal_validity_window_mapping` | `memory_evolution` | `blocked` | `blocked` | XY-929 adds a representative blocked fixture for Graphiti/Zep temporal-validity scoring. The job remains blocked until provider-backed Docker output maps current and historical validity-window facts to generated evidence ids.
command: `cargo make real-world-memory-graph-rag`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphiti_temporal_validity_blocked.json` | +| `graphiti_zep_research_gate` | `hosted_zep_temporal_memory` | `memory_evolution` | `unsupported` | `non_goal` | Hosted Zep service behavior is outside the Docker-local representative lane; no hosted-service result is used as ELF win/loss evidence.
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `letta_research_gate` | `core_block_attachment_readback` | `core_archival_memory` | `blocked` | `blocked` | ELF fixture core-archival-core-block-attachment-001 scores exact core block attachment and keeps core readback out of Qdrant-backed archival search. Letta remains blocked until the generated export/readback artifact maps this core block attachment source id.
command: `cargo make smoke-letta-core-archive-export-readback`
artifact: `tmp/real-world-memory/letta-core-archive/summary.json` | +| `letta_research_gate` | `core_block_scope_readback` | `core_archival_memory` | `blocked` | `blocked` | ELF fixture core-archival-core-block-scope-001 scores read_profile, shared scope, and private-owner boundaries. Letta scope behavior remains blocked until the generated export includes agent, block, visibility metadata, and source ids.
command: `cargo make smoke-letta-core-archive-export-readback`
artifact: `tmp/real-world-memory/letta-core-archive/summary.json` | +| `letta_research_gate` | `core_block_provenance_readback` | `core_archival_memory` | `blocked` | `blocked` | ELF fixture core-archival-core-block-provenance-001 scores source_ref and audit_history readback. Letta provenance remains blocked until exported core memory includes stable source ids and audit-equivalent events.
command: `cargo make smoke-letta-core-archive-export-readback`
artifact: `tmp/real-world-memory/letta-core-archive/summary.json` | +| `letta_research_gate` | `stale_core_detection` | `core_archival_memory` | `blocked` | `blocked` | ELF fixture core-archival-stale-core-detection-001 scores archival evidence superseding a stale core block. Letta stale-core comparison is blocked until core export and archival readback can be joined by source ids.
command: `cargo make smoke-letta-core-archive-export-readback`
artifact: `tmp/real-world-memory/letta-core-archive/summary.json` | +| `letta_research_gate` | `archival_fallback_readback` | `core_archival_memory` | `blocked` | `blocked` | ELF fixture core-archival-archival-fallback-001 scores fallback from insufficient core memory to archival note search. Letta fallback comparison is blocked until archival search output can be exported with source ids.
command: `cargo make smoke-letta-core-archive-export-readback`
artifact: `tmp/real-world-memory/letta-core-archive/summary.json` | +| `letta_research_gate` | `core_archival_project_decision_recovery` | `core_archival_memory` | `blocked` | `blocked` | ELF fixture core-archival-project-decision-recovery-001 scores core routing plus archival decision rationale. Letta project-decision recovery remains blocked until the generated export/readback artifact maps core routing plus archival rationale source ids.
command: `cargo make smoke-letta-core-archive-export-readback`
artifact: `tmp/real-world-memory/letta-core-archive/summary.json` | +| `llm_wiki_research_gate` | `wiki_page_citation_lint` | `knowledge_compilation` | `not_encoded` | `not_tested` | llm-wiki remains a knowledge-workflow reference. No Docker-contained plugin or file-based page materializer emits cited wiki sections for scoring.
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `gbrain_research_gate` | `compiled_truth_timeline_export` | `knowledge_compilation` | `blocked` | `blocked` | gbrain compiled-truth and timeline scoring remains blocked until a Docker-local brain repository and database setup emits current-truth pages with source timeline evidence.
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `graphify_docker_smoke` | `graph_report_navigation_lint` | `knowledge_compilation` | `wrong_result` | `not_tested` | XY-929 adds a representative graphify fixture that scores graph report navigation, source-location citations, stale-source lint, and unsupported-summary handling as wrong_result because stale-source lint is still missing. This remains graphify non-pass evidence, not an ELF victory claim.
command: `cargo make real-world-memory-graph-rag`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphify_graph_report_wrong_result.json` | +| `graphify_docker_smoke` | `broad_graph_navigation_quality` | `retrieval` | `not_encoded` | `not_tested` | Broad graph-navigation, codebase, multimodal, and private-corpus quality remain not_tested; the graphify evidence is bounded to generated graph/report artifacts.
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | + +### Adapter Execution Metadata + +| Adapter | Sources | Setup Path | Runtime Boundary | Resource Expectation | Retry Guidance | Research Depth | +| --- | --- | --- | --- | --- | --- | --- | +| `openviking_live_baseline` | [OpenViking repository](https://github.com/volcengine/OpenViking/): Official source for OpenViking local context database, resource, and retrieval APIs.
[llama-cpp-python CPU wheel index](https://abetlen.github.io/llama-cpp-python/whl/cpu): Official prebuilt CPU wheel index used by the Docker-local embedding pin. | Run ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker. The runner installs llama-cpp-python==0.3.28 with --only-binary llama-cpp-python from the CPU wheel index before OpenViking add_resource/find. | docker-compose.baseline.yml baseline-runner container; no host-global OpenViking, llama-cpp-python, or model service install is required. | Local embedding setup may download a CPU wheel and model assets; record OpenViking.log, elapsed time, and cache size before claiming adapter quality. | Use the default pinned CPU wheel path first.; Override ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_VERSION or ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_INDEX only when the default wheel is unavailable for the Docker platform.; Treat install/import failure as incomplete, not wrong_result; treat add_resource/find evidence misses as wrong_result. | not recorded | +| `qmd_deep_profile_gate` | [qmd repository](https://github.com/tobi/qmd): Official qmd source for local hybrid search, CLI setup, and query behavior. | Use the existing Docker baseline qmd install, collection add, update, embed, and query flow with scale or stress profiles. | docker-compose.baseline.yml baseline-runner container with project files and caches inside Docker volumes. | CPU local embedding and rerank cost scale with corpus size; record elapsed time and qmd log artifacts before claims. | Run qmd stress profile in Docker and publish the artifact path.; Map qmd JSON output to retrieval-debug real_world_job scoring before suite claims. | D2 reviewed; deep profile not encoded | +| `openviking_deep_profile_gate` | [OpenViking repository](https://github.com/volcengine/OpenViking/): Official source for OpenViking local context database, resource, and retrieval APIs. 
| Use the pinned Docker local embedding path from scripts/live-baseline-benchmark.sh, then run OpenViking add_resource/find before any deep profile scoring. | docker-compose.baseline.yml baseline-runner container; no host model or compiler setup outside Docker. | Local embedding setup can download CPU wheels and model assets; record build/import logs, model cache size, and elapsed time. | Run the default pinned llama-cpp-python==0.3.28 CPU wheel path first.; Override the OpenViking llama-cpp-python version or index only when the default wheel is unavailable for the Docker platform.; Fix evidence-bearing same-corpus output and materialize selected hierarchy/expansion artifacts before converting blocked context_trajectory fixtures into scored jobs. | D2 reviewed; local embedding setup pinned; blocked fixtures encoded | +| `ragflow_research_gate` | [RAGFlow repository](https://github.com/infiniflow/ragflow): Official source for RAGFlow service code and Docker Compose setup.
[RAGFlow docs](https://ragflow.io/docs/): Official deployment and setup documentation.
[RAGFlow HTTP API reference](https://raw.githubusercontent.com/infiniflow/ragflow/main/docs/references/http_api_reference.md): Official reference for OpenAI-compatible responses with reference chunks and document metadata. | Implement a tiny Docker evidence-smoke runner using the official Docker deployment, dataset ingest API, and OpenAI-compatible query API. | Run scripts/ragflow-docker-evidence-smoke.sh through cargo make; the live path uses the official RAGFlow Docker Compose service boundary without host-global RAGFlow installs. | Large multi-service RAG stack; generated artifacts record CPU/GPU mode, memory, disk, image size, expanded disk notes, startup time, vm.max_map_count handling, and provider boundaries before scoring. | Run cargo make smoke-ragflow-docker first to produce a typed preflight artifact.; Start the live path only with ELF_RAGFLOW_SMOKE_START=1 and ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1.; Keep private corpora and operator-owned provider credentials out of this smoke; map only generated public corpus reference chunks to evidence ids. | D2 feasibility verdict plus XY-885 evidence-smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches query output | +| `lightrag_research_gate` | [LightRAG repository](https://github.com/HKUDS/LightRAG): Official source for LightRAG server, Docker, and retrieval modes.
[LightRAG Docker docs](https://github.com/HKUDS/LightRAG/blob/main/docs/DockerDeployment.md): Official Docker deployment reference.
[LightRAG API server docs](https://github.com/HKUDS/LightRAG/blob/main/docs/LightRAG-API-Server.md): Official query-mode and context-output reference.
[LightRAG core programming docs](https://github.com/HKUDS/LightRAG/blob/main/docs/ProgramingWithCore.md): Official source-id and file-path citation reference. | Run cargo make smoke-lightrag-docker-context for a typed preflight artifact; set ELF_LIGHTRAG_CONTEXT_START=1 to start the opt-in LightRAG Docker profile and attempt live context export. | docker-compose.baseline.yml baseline-runner plus opt-in lightrag and lightrag-mock-provider services; generated source files and LightRAG data stay in Docker-mounted artifact paths and Docker volumes. | The default profile uses the official LightRAG image, a local OpenAI-compatible mock provider, 64-dimensional embeddings, rerank disabled for context queries, cargo/pip/Hugging Face caches, and Docker volumes for rag_storage, inputs, and prompts. | Run cargo make smoke-lightrag-docker-context first; a missing API must remain a typed incomplete artifact, not a pass claim.; Set ELF_LIGHTRAG_CONTEXT_START=1 only when Docker may pull/start the LightRAG service profile.; Score retrieval only when returned context, references.file_path, or references.content map to required evidence ids. | D2 feasibility plus XY-886 context-export implementation and XY-900 scored smoke aggregation; checked-in record remains research_gate unless a generated artifact reaches query output | +| `graphrag_research_gate` | [GraphRAG repository](https://github.com/microsoft/graphrag): Official Microsoft GraphRAG source and setup reference.
[GraphRAG docs](https://microsoft.github.io/graphrag/): Official documentation for indexing and querying.
[GraphRAG input docs](https://microsoft.github.io/graphrag/index/inputs/): Official input format and document metadata reference.
[GraphRAG output tables](https://microsoft.github.io/graphrag/index/outputs/): Official output schema with document, text unit, community, and relationship identifiers.
[GraphRAG local search docs](https://microsoft.github.io/graphrag/query/local_search/): Official local-search context and graph traversal reference. | Run cargo make smoke-graphrag-docker for a typed preflight artifact; set ELF_GRAPHRAG_SMOKE_RUN=1 with explicit provider configuration for a live GraphRAG index/query attempt. | docker-compose.baseline.yml baseline-runner, container-local Python venv, generated public corpus, and report artifacts under tmp/real-world-memory/graphrag-smoke. | The default profile uses a generated public corpus capped by ELF_GRAPHRAG_MAX_DOCS and ELF_GRAPHRAG_MAX_INPUT_CHARS, pins GraphRAG through ELF_GRAPHRAG_PACKAGE, and records elapsed time, cache size, output size, and observed cache entries. | Run cargo make smoke-graphrag-docker first; missing provider configuration must remain a typed blocked artifact, not a pass claim.; Enable ELF_GRAPHRAG_SMOKE_RUN=1 only for generated public corpus indexing with explicit provider configuration.; Fail typed if source document or text_unit identifiers cannot be mapped to expected evidence IDs. | D2 feasibility plus XY-887 Docker smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches GraphRAG output | +| `graphiti_zep_research_gate` | [Graphiti repository](https://github.com/getzep/graphiti): Official open-source temporal context graph engine.
[Zep Graphiti overview](https://www.getzep.com/platform/graphiti/): Official product documentation for temporal context graph behavior.
[Graphiti quick start](https://help.getzep.com/graphiti/getting-started/quick-start): Official setup, episode ingest, and search output reference.
[Graphiti FalkorDB configuration](https://help.getzep.com/graphiti/configuration/falkor-db-configuration): Official Docker-local FalkorDB setup reference.
[Graphiti fact triples](https://help.getzep.com/graphiti/working-with-data/adding-fact-triples): Official manual fact-triple ingest contract. | Run cargo make smoke-graphiti-zep-docker-temporal for a typed artifact; set ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration for a live attempt. | docker-compose.baseline.yml baseline-runner plus graphiti-zep FalkorDB profile, container-local Python venv, generated public temporal facts, and report artifacts under tmp/real-world-memory/graphiti-zep-smoke. | Requires Docker-local FalkorDB plus LLM/embedding configuration; generated artifacts record service startup, storage size, provider boundaries, fact count, and timeout before scoring. | Run cargo make smoke-graphiti-zep-docker-temporal first to produce a typed blocked artifact.; Start the live path only with ELF_GRAPHITI_ZEP_SMOKE_START=1, ELF_GRAPHITI_ZEP_SMOKE_RUN=1, and explicit provider configuration.; Treat missing validity windows or unmapped current/historical facts as wrong_result, not pass. | D2 feasibility plus XY-888 Docker temporal smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches Graphiti search output | +| `letta_research_gate` | [Letta Docker docs](https://docs.letta.com/guides/docker): Official Docker setup and explicit embedding configuration boundary.
[Letta Python API](https://docs.letta.com/api/python): Official Python SDK memory block creation and retrieval examples.
[Letta archival search API](https://docs.letta.com/api/resources/agents/subresources/passages/methods/search): Official archival-memory search endpoint contract. | Run cargo make smoke-letta-core-archive-export-readback for a typed artifact; set ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 with explicit model/provider configuration for a live export attempt. The smoke exports core block JSON plus archival search/readback JSON when Letta setup succeeds. | docker-compose.baseline.yml baseline-runner plus optional Letta server profile, benchmark-created agent, benchmark-owned fixture corpus, no hosted/private state, and artifacts under tmp/real-world-memory/letta-core-archive. | Letta Docker server, Python SDK client, explicit model and embedding configuration, exported core memory, archival search output, and provider boundaries must be explicit in the artifact. | Default command records a typed blocked artifact without model calls.; Enable the live path only with Docker-local Letta and explicit provider or local model configuration.; Score core-versus-archival scenarios only after core block export and archival list/search output map to fixture evidence ids. | D1 feasibility verdict: research_only (XY-882); XY-927 selected the contained export/readback contract; XY-984 adds the Docker-contained materializer and keeps the comparison blocked until live export evidence maps source ids. | +| `langgraph_research_gate` | [LangGraph persistence docs](https://docs.langchain.com/oss/python/langgraph/persistence): Official documentation for checkpoints, replay, fork, and persistence behavior. | Build a tiny LangGraph agent with a checkpointer and explicit memory read/write steps before scoring. | Docker-only Python harness with checkpoint store under the artifact directory. | Small runtime expected, but LLM calls and side effects must be stubbed or deterministic before replay claims. 
| Encode one replay/fork failure recovery job.; Keep LangGraph classified as replay reference unless memory retrieval is actually exercised. | D1 feasibility verdict: research_only (XY-882); replay/checkpoint reference, adapter not encoded | +| `nanograph_research_gate` | [nanograph repository](https://github.com/nanograph/nanograph): Official source for on-device typed property graph behavior. | Build or install nanograph inside Docker and load a typed graph fixture from generated corpus facts. | Docker-only CLI run with graph folder under benchmark artifacts. | Light local graph runtime expected; record binary build/install time and graph artifact size. | Define a minimal schema for memory_evolution facts.; Score typed query output only if it cites fixture evidence IDs. | D1 feasibility verdict: research_only (XY-882); typed graph DX reference, adapter not encoded | +| `llm_wiki_research_gate` | [llm-wiki repository](https://github.com/nvk/llm-wiki): Official source for the LLM Wiki plugin and knowledge-base workflow. | Research plugin bootstrap inside a Docker-contained Codex or file-based harness, then materialize page artifacts. | Docker-only plugin or fixture materializer; no user-global Codex plugin install. | LLM generation cost depends on page build; record provider boundary and generated artifact size. | Prototype a fixture-only page build with explicit citations.; Do not score until generated sections can be mapped to evidence IDs. | D1 feasibility verdict: research_only (XY-882); derived wiki workflow reference, adapter not encoded | +| `gbrain_research_gate` | [gbrain repository](https://github.com/garrytan/gbrain): Official source for brain repo and retrieval workflow.
[compiled truth guide](https://github.com/garrytan/gbrain/blob/master/docs/guides/compiled-truth.md): Official guide for compiled truth plus timeline behavior. | Create a Docker-local brain repo fixture, run import/sync, and export compiled truth plus timeline evidence. | Docker-only repository and database state with no operator-owned brain repo. | Postgres-backed sync and embedding choices must be explicit; record DB size and import time. | Prototype a tiny brain repo with one current-truth page and timeline.; Score only if compiled truth cites the source timeline evidence. | D1 feasibility verdict: blocked (XY-882); Docker-local brain repo and database path not proven | +| `graphify_docker_smoke` | [graphify repository](https://github.com/safishamsi/graphify): Official source for graphify graph extraction and query workflow.
[graphify README](https://github.com/safishamsi/graphify/blob/v3/README.md): Official CLI, output artifact, query, and source-location contract. | Run cargo make smoke-graphify-docker-graph-report to install graphify in Docker, build graph/report artifacts from a generated public corpus, and export query evidence without installing host-global assistant hooks. | docker-compose.baseline.yml baseline-runner, container-local Python venv, isolated HOME/config paths, generated public corpus, and artifacts under tmp/real-world-memory/graphify-smoke. | Graph build cost scales with corpus and model choices; generated artifacts record package reference, provider/model boundary, build time, graph size, report size, cache size, timeout, and retry behavior. | Run cargo make smoke-graphify-docker-graph-report first; setup/runtime failures must remain typed artifacts, not pass claims.; Do not use graphify host assistant hook installs or operator-owned assistant configuration as proof.; Score graph-guided answers only when graph.json, GRAPH_REPORT.md, and graphify query output map to generated evidence ids. | D1 feasibility verdict plus XY-889 Docker graph/report smoke implementation and XY-900 scored smoke promotion; current Docker validation reaches graphify output and scores the tiny knowledge_compilation job as wrong_result | + +## Capture And Integration Coverage + +The real-world job runner is fixture-backed. This section separates encoded evidence from live adapter claims. + +| Class | Behaviors | +| --- | --- | +| real | - | +| fixture-backed | - | +| mocked | - | +| blocked | - | +| not encoded | No capture/integration behavior was declared by encoded fixtures. 
| + +## Suites + +| Suite | Status | Jobs | Score | Evidence Recall | Irrelevant Context | Trace Explain | Stale Answers | Conflicts | Update Rationales | Temporal Gaps | History Readback | Unsupported Claims | Wrong Results | Reason | +| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | +| trust_source_of_truth | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| work_resume | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| project_decisions | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| retrieval | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| memory_evolution | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| adversarial_quality | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| consolidation | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| memory_summary | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| proactive_brief | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| scheduled_memory | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. 
| +| knowledge_compilation | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| source_library | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| operator_debugging_ux | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| capture_integration | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| production_ops | `blocked` | 7 | `0.714` | `1.000` | `0.000` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | At least one encoded job is blocked. | +| personalization | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| core_archival_memory | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| context_trajectory | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. 
| + +## Jobs + +| Suite | Job | Status | Answer Type | Caveat Required | Refusal Required | Unknown Allowed | Score | Evidence Recall | Irrelevant Context | Expected Evidence | Produced Evidence | Trace Failure Stage | Stale Answers | Conflicts | Update Rationale | Temporal Gap | Unsupported Claims | Wrong Results | Latency | Cost | +| --- | --- | --- | --- | --- | --- | --- | ---: | ---: | ---: | --- | --- | --- | ---: | ---: | --- | --- | ---: | ---: | ---: | --- | +| production_ops | production-ops-restore-cold-start-001 | `pass` | `direct_answer` | `false` | `false` | `false` | `1.000` | `1.000` | `0.000` | `restore-search-before, restore-qdrant-rebuild, restore-search-after` | `restore-qdrant-rebuild, restore-search-after, restore-search-before` | `-` | 0 | 0 | `false` | `false` | 0 | 0 | `2.100 ms` | `0.000 USD` | +| production_ops | production-ops-cold-start-dependency-001 | `pass` | `direct_answer` | `false` | `false` | `true` | `1.000` | `1.000` | `0.000` | `pinned-local-embed-runtime-reached, pinned-local-embed-retry, openviking-wrong-result-behavior, typed-incomplete-policy` | `openviking-wrong-result-behavior, pinned-local-embed-retry, pinned-local-embed-runtime-reached, typed-incomplete-policy` | `-` | 0 | 0 | `false` | `false` | 0 | 0 | `1.800 ms` | `0.000 USD` | +| production_ops | production-ops-credential-boundary-001 | `blocked` | `direct_answer` | `true` | `false` | `true` | `0.000` | `1.000` | `0.000` | `provider-credential-boundary, checked-in-secret-boundary` | `checked-in-secret-boundary, provider-credential-boundary` | `-` | 0 | 0 | `false` | `false` | 0 | 0 | `1.700 ms` | `0.000 USD` | +| production_ops | production-ops-backfill-resume-001 | `pass` | `direct_answer` | `false` | `false` | `false` | `1.000` | `1.000` | `0.000` | `backfill-checkpoint-state, backfill-clean-compare` | `backfill-checkpoint-state, backfill-clean-compare` | `-` | 0 | 0 | `false` | `false` | 0 | 0 | `2.000 ms` | `0.000 USD` | +| production_ops | 
production-ops-private-manifest-blocked-001 | `blocked` | `direct_answer` | `true` | `false` | `true` | `0.000` | `1.000` | `0.000` | `private-manifest-guard, private-bounded-failure-policy` | `private-bounded-failure-policy, private-manifest-guard` | `-` | 0 | 0 | `false` | `false` | 0 | 0 | `1.600 ms` | `0.000 USD` | +| production_ops | production-ops-public-proxy-addendum-001 | `pass` | `direct_answer` | `true` | `false` | `false` | `1.000` | `1.000` | `0.000` | `public-proxy-addendum-pass, public-proxy-latency-resource-cost, public-proxy-claim-boundary` | `public-proxy-addendum-pass, public-proxy-claim-boundary, public-proxy-latency-resource-cost` | `-` | 0 | 0 | `false` | `false` | 0 | 0 | `10.843 ms` | `0.000 USD` | +| production_ops | production-ops-resource-envelope-001 | `pass` | `direct_answer` | `false` | `false` | `false` | `1.000` | `1.000` | `0.000` | `resource-envelope-check, large-import-planning-caveat` | `large-import-planning-caveat, resource-envelope-check` | `-` | 0 | 0 | `false` | `false` | 0 | 0 | `2.300 ms` | `0.000 USD` | + +## Operator Debugging UX + +No encoded job reported operator debugging evidence. + +## Memory Evolution + +- Stale answers: `0` +- Conflict detections: `0` +- Update rationales available: `0` +- Temporal validity not encoded: `0` + +- History readback encoded: `0` + +| Suite | Job | Current Evidence | Historical Evidence | Tombstone/Invalidation | Selected Current | Selected Historical | Selected Rationale | Selected Tombstone/Invalidation | Selected But Not Narrated | Stale Traps Used | Conflict Count | Detected | Update Rationale | Temporal Validity | History Readback | Follow-up | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | ---: | ---: | --- | --- | --- | --- | + +## Trace Explainability + +No encoded job reported trace explainability metadata. + +## Unsupported Claims + +No unsupported claims were produced by encoded jobs. 
+ +## Follow-Ups + +| Suite | Job | Follow-up | Reason | +| --- | --- | --- | --- | +| production_ops | production-ops-credential-boundary-001 | Run provider-backed production-ops gate with routed operator credentials | Credential-bound checks need an operator shell with provider environment variables; fixture reports can only encode the boundary. | +| production_ops | production-ops-private-manifest-blocked-001 | Supply an operator-owned private production corpus manifest | A real private-corpus pass requires a sanitized local manifest supplied outside checked-in fixtures. | + +## Result Semantics + +This report uses `docs/spec/real_world_agent_memory_benchmark_v1.md` status terms. +It is a real-world job fixture report, not a Docker live-baseline report. +Existing live-baseline reports remain valid for their encoded retrieval and lifecycle checks and are not reinterpreted as real-world suite wins. + +The summary counters report required evidence coverage, source-ref coverage, quote coverage, expected evidence recall, irrelevant context ratio, trace explainability, stale retrievals, scope violations, redaction leaks, Qdrant rebuild case coverage, stale answers, conflict detections, update rationale availability, and temporal validity gaps across encoded jobs. + +- `pass`: encoded jobs met their pass threshold with required evidence and no hard-fail rule. +- `wrong_result`: a job completed but missed required answer or evidence expectations. +- `incomplete`: the runner or adapter did not reach the behavioral check. +- `blocked`: required credentials, private input, product runtime, or host integration is outside the run scope. +- `not_tested`: a comparison row or report slice has no executed benchmark evidence. +- `unsupported_claim`: a job produced a substantive claim not supported by the fixture evidence links. +- `not_encoded`: a suite has no checked-in fixture, or an encoded fixture declares a capability gap so no pass/fail claim is allowed. 
+- `fixture_backed`: checked-in fixtures were scored; no live product execution is implied. +- `live_baseline`: Docker live-baseline retrieval or lifecycle evidence exists, but it is not a real-world suite pass by itself. +- `live_real_world`: a live adapter ran the real-world job contract and reported typed outcomes. +- `research_gate`: research, setup, source mapping, or resource gates are recorded before a fair benchmark can run. + +Any `wrong_result`, `incomplete`, `blocked`, `not_tested`, `not_encoded`, `unsupported_claim`, or non-live evidence class must remain visible and must not be counted as a win. + +For `knowledge_compilation` jobs, generated pages are benchmark artifacts. Page sections must cite source evidence or timeline events, or be explicitly flagged as unsupported. Flagged unsupported summaries are counted separately from hidden unsupported claims. + +For `source_library` jobs, saved long-form material and social/thread captures are source records, not durable Memory Notes. Source records must preserve canonical source metadata, source_ref hydration pointers, and explicit promotion boundaries before any memory write is claimed. + +For `memory_summary` jobs, summary artifacts are derived review surfaces. Top-of-mind entries must be current, included or downgraded entries must carry source refs, and derived project-profile entries must either cite sources or be explicitly flagged as unsupported. + +For `proactive_brief` jobs, brief artifacts are fixture-scored derived outputs, not scheduled UI behavior. Every suggestion must carry evidence refs, freshness/currentness metadata, and an action rationale; stale, superseded, or tombstoned sources must not be presented as current recommendations. + +For `scheduled_memory` jobs, task artifacts are deterministic fixture-scored stand-ins for asynchronous work. 
Every output must carry evidence refs, freshness/currentness metadata, action rationale, and execution trace/readback evidence; scheduled tasks must not mutate source notes silently or claim hosted scheduler/private-provider parity from fixture-only output. + +## Suites With `not_encoded` Status + +- `trust_source_of_truth` +- `work_resume` +- `project_decisions` +- `retrieval` +- `memory_evolution` +- `adversarial_quality` +- `consolidation` +- `memory_summary` +- `proactive_brief` +- `scheduled_memory` +- `knowledge_compilation` +- `source_library` +- `operator_debugging_ux` +- `capture_integration` +- `personalization` +- `core_archival_memory` +- `context_trajectory` diff --git a/docs/evidence/benchmarking/index.md b/docs/evidence/benchmarking/index.md index 58cb526f..bc0ab4f6 100644 --- a/docs/evidence/benchmarking/index.md +++ b/docs/evidence/benchmarking/index.md @@ -56,3 +56,4 @@ Routes to: Benchmarking evidence concepts under `docs/evidence/benchmarking/`. - `2026-06-23-temporal-trajectory-adapter-coverage-report.md`: Temporal and Trajectory Adapter Coverage Report - June 23, 2026; refreshes Graphiti/Zep temporal-validity and OpenViking context-trajectory adapter evidence with trace-stage typed blockers, source ids, and explicit no-parity boundaries. - `2026-06-23-graph-rag-adapter-matrix-report.md`: Graph/RAG Adapter Matrix Report - June 23, 2026; adds manifest-backed RAGFlow, GraphRAG, and LightRAG rows for retrieval, citation, navigation, stale-source behavior, answer faithfulness, and knowledge compilation while preserving 0 pass rows and no graph/RAG parity claim. 
- `2026-06-23-p3-competitor-strength-absorption-report.md`: P3 Competitor-Strength Absorption Report - June 23, 2026; closes XY-1072 by naming which qmd, PageIndex/OpenKB, mem0/OpenMemory, Letta, Graphiti/Zep, OpenViking, RAGFlow, GraphRAG, and LightRAG strengths ELF absorbed, which remain stronger elsewhere or blocked, and which P4 optimization queue items are ready for main-thread inspection without applying a queue label. +- `2026-06-23-p4-production-readiness-evidence-gates-report.md`: P4 Production-Readiness Evidence Gates Report - June 23, 2026; adds `cargo make real-world-memory-p4-production-readiness`, records latency, cost, resource, cold-start, restore, and Qdrant rebuild evidence, separates local fixture, public-proxy, private-corpus, and provider-backed tiers, and preserves private/provider inputs as typed blockers. diff --git a/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md b/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md index b93f03b3..064f0213 100644 --- a/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md @@ -187,8 +187,9 @@ including the retrieval-quality slice below. The suite currently encodes: source-id preservation, evidence binding, no secret leakage, and fixture-backed capture/integration boundary classification. - `production_ops`: interrupted generated backfill resume, backup/restore plus - cold-start readback, resource-envelope interpretation, pinned OpenViking local - embedding runtime/wrong-result classification, missing private manifest `blocked` + cold-start readback, resource-envelope interpretation, public-proxy + production-private addendum readback, pinned OpenViking local embedding + runtime/wrong-result classification, missing private manifest `blocked` classification, and provider credential boundary `blocked` classification. 
- `personalization`: scoped stable preference correction without temporary or cross-project preference leakage. @@ -214,10 +215,12 @@ including the retrieval-quality slice below. The suite currently encodes: The generated report includes the public quality scoreboard `elf.quality_scoreboard/v1`, encoded-job and external-adapter typed non-pass counts/states, aggregate typed non-pass counts/states, evidence-class counts, bounded -job and aggregate summary claims, the unqualified-win guard, evidence coverage, -source-ref coverage, quote coverage, unsupported-claim count, stale retrieval count, -stale-answer count, conflict detection count, update rationale availability, temporal -validity encoding count, scope correctness, redaction leak count, capture/integration +job and aggregate summary claims, the unqualified-win guard, operational evidence +gates with `local_fixture`, `public_proxy`, `private_corpus`, and `provider_backed` +tiers, evidence coverage, source-ref coverage, quote coverage, unsupported-claim +count, stale retrieval count, stale-answer count, conflict detection count, update +rationale availability, temporal validity encoding count, scope correctness, +redaction leak count, capture/integration behavior classes, Qdrant rebuild case/pass counts, expected evidence recall, irrelevant context ratio, latency/cost, answer-type plus caveat/refusal/uncertainty flags, trace explainability counters, production-ops @@ -363,8 +366,8 @@ The public quality scoreboard renders the existing manifest evidence bucket external adapter manifest is loaded, the scoreboard's typed non-pass count includes adapter coverage and scenario rows as well as fixture jobs. -Current fixture state: `cargo make real-world-memory-json` covers 72 jobs across 18 -suites, with 65 pass and 7 blocked. The adversarial quality slice contributes five +Current fixture state: `cargo make real-world-memory-json` covers 73 jobs across 18 +suites, with 66 pass and 7 blocked. 
The adversarial quality slice contributes five passing fixture-backed jobs that exercise stale fact suppression, unsupported-claim refusal, source-authority conflicts, private-span exclusion, and correction persistence. The P1 closeout fixture slice contributes four passing jobs for @@ -657,11 +660,19 @@ Current checked-in production-ops increment: cargo make real-world-memory-production-ops ``` +Current P4 production-readiness evidence-gate slice: + +```sh +cargo make real-world-memory-p4-production-readiness +``` + Artifacts: ```text tmp/real-world-memory/production-ops-report.json tmp/real-world-memory/production-ops-report.md +tmp/real-world-memory/p4-production-readiness/report.json +tmp/real-world-memory/p4-production-readiness/report.md ``` The production-ops fixtures live under @@ -669,7 +680,9 @@ The production-ops fixtures live under readback over existing public benchmark and restore evidence: interrupted backfill resume from checkpoint, clean-run comparison, backup/restore readback, Qdrant rebuild from Postgres-held vectors, cold-start search recovery, and resource-envelope -interpretation. +interpretation. The P4 slice also encodes the operator-approved public-proxy +production-private addendum and emits `elf.operational_evidence_gates/v1` so local +fixture, public-proxy, private-corpus, and provider-backed evidence remain separate. The same slice deliberately keeps non-pass boundaries typed. A missing private production manifest is `blocked`, unavailable provider credentials are `blocked`, and @@ -680,6 +693,10 @@ import on a Docker platform, that setup boundary remains `incomplete`. These sta are evidence for operator caveats, not proof of private-corpus, provider-backed production, or external-adapter quality success. +Public-proxy passes are useful production-readiness signals, but they do not satisfy a +real private-corpus gate. 
Local-hash or fixture-backed cost/latency records are +operational accounting evidence, not hosted provider-spend or provider-quality proof. + This suite does not run private corpus data, does not require or publish credentials, does not perform live Docker restore/backfill work, and does not reinterpret older live-baseline reports as real-world production-ops wins. For personal production diff --git a/docs/spec/real_world_agent_memory_benchmark_v1.md b/docs/spec/real_world_agent_memory_benchmark_v1.md index 01360c73..936028d4 100644 --- a/docs/spec/real_world_agent_memory_benchmark_v1.md +++ b/docs/spec/real_world_agent_memory_benchmark_v1.md @@ -149,7 +149,7 @@ runner execution. | `encoding` | object | Optional job-level limitation declaration. Only `not_encoded`, `blocked`, and `incomplete` statuses are allowed here. | | `memory_evolution` | object or null | Optional for most suites; used by `memory_evolution` jobs to report current evidence, historical evidence, stale traps, conflicts, update rationale, and temporal-validity limitations. | | `memory_summary` | object or null | Optional for most suites; used by `memory_summary` jobs to report reviewable summary/source-trace metrics defined in `system_memory_summary_v1.md`. | -| `tags` | array | Optional labels such as `private_corpus`, `synthetic`, `adapter_required`, or `no_live_claim`. | +| `tags` | array | Optional labels such as `private_corpus`, `public_proxy`, `provider_backed`, `synthetic`, `adapter_required`, or `no_live_claim`. | ### `corpus` @@ -678,6 +678,14 @@ Reports MUST include: counts, visible typed non-pass states for each bucket and the aggregate report, evidence-class counts, bounded job and aggregate summary claims, and an explicit unqualified-win guard; +- operational evidence gates using schema `elf.operational_evidence_gates/v1`, + separating `local_fixture`, `public_proxy`, `private_corpus`, and + `provider_backed` tiers. 
The gates MUST report tier status, job counts, pass and + typed non-pass counts, mean latency, cost summary, resource-envelope counts, + cold-start/restore/Qdrant-rebuild counts, typed blocker reasons, and explicit + booleans for whether private-corpus or provider-backed pass claims are allowed. + Local fixture and public-proxy passes MUST NOT satisfy private-corpus or + provider-backed proof. - run id, runner version, corpus profile, job ids, suite ids, project adapter metadata; - per-job status, normalized score, hard-fail hits, evidence ids used, trap ids used; - per-job `answer_type`, required caveat/refusal flags, and whether an unknown answer @@ -747,6 +755,10 @@ claiming scheduled provider-backed generation. - A project MAY claim a suite pass only for suites with encoded jobs and a published report using this contract. - A project MUST NOT use generated public jobs to claim private production readiness. +- A project MUST NOT use local-hash, mock-provider, fixture-only, or public-proxy + evidence to claim provider-backed production behavior. Missing private manifests or + provider credentials MUST remain typed `blocked`, `incomplete`, or `not_encoded` + rows with visible blocker reasons. - A project MUST NOT treat `blocked`, `incomplete`, or `not_encoded` as evidence of weakness or strength; those states only describe benchmark coverage. - A project MUST NOT claim "best memory system" from this suite. Reports SHOULD describe