Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions Makefile.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@
# | real-world-memory-p1-closeout | composite | |
# | real-world-memory-p1-closeout-json | command | |
# | real-world-memory-p1-closeout-report | command | |
# | real-world-memory-p4-production-readiness | composite | |
# | real-world-memory-p4-production-readiness-json | command | |
# | real-world-memory-p4-production-readiness-report | command | |
# | real-world-memory-p2-knowledge-closeout | composite | |
# | real-world-memory-core-archival | composite | |
# | real-world-memory-core-archival-json | command | |
Expand Down Expand Up @@ -429,6 +432,55 @@ args = [
"tmp/real-world-memory/p1-closeout/report.md",
]

# Composite entry point for the P4 production-readiness evidence run
# (`cargo make real-world-memory-p4-production-readiness`).
# It defines no command of its own: it only pulls in the `-report` task,
# which in turn depends on the `-json` task, so the JSON report is produced
# before the Markdown report.
[tasks.real-world-memory-p4-production-readiness]
# Turn off cargo-make's per-member workspace handling for this task.
workspace = false
dependencies = [
"real-world-memory-p4-production-readiness-report",
]

# Runs the `real_world_job_benchmark` binary from the `elf-eval` package with
# its `run` subcommand over the production_ops fixture directory and writes
# the result to tmp/real-world-memory/p4-production-readiness/report.json.
# That file is the input of the `-report` task.
[tasks.real-world-memory-p4-production-readiness-json]
# Turn off cargo-make's per-member workspace handling for this task.
# NOTE(review): the relative paths below presumably resolve from the
# repository root - confirm.
workspace = false
command = "cargo"
args = [
"run",
"-p",
"elf-eval",
"--bin",
"real_world_job_benchmark",
# Everything after `--` is passed to the benchmark binary, not to cargo.
"--",
"run",
"--fixtures",
"apps/elf-eval/fixtures/real_world_memory/production_ops",
"--out",
"tmp/real-world-memory/p4-production-readiness/report.json",
"--run-id",
"real-world-memory-p4-production-readiness",
# NOTE(review): the production_ops fixtures set
# `adapter_response.adapter_id` to this same string; presumably the two
# must stay in sync - confirm against the benchmark binary.
"--adapter-id",
"fixture_production_ops",
"--adapter-name",
"ELF P4 production-readiness fixture",
]

# Runs the `real_world_job_benchmark` binary with its `publish` subcommand,
# reading the JSON report written by the `-json` task and writing
# tmp/real-world-memory/p4-production-readiness/report.md.
# NOTE(review): presumably `publish` renders the JSON report as Markdown -
# confirm against the binary.
[tasks.real-world-memory-p4-production-readiness-report]
# Turn off cargo-make's per-member workspace handling for this task.
workspace = false
# The `-json` task must run first so that report.json exists.
dependencies = [
"real-world-memory-p4-production-readiness-json",
]
command = "cargo"
args = [
"run",
"-p",
"elf-eval",
"--bin",
"real_world_job_benchmark",
# Everything after `--` is passed to the benchmark binary, not to cargo.
"--",
"publish",
# Must match the `--out` path of the `-json` task.
"--report",
"tmp/real-world-memory/p4-production-readiness/report.json",
"--out",
"tmp/real-world-memory/p4-production-readiness/report.md",
]

[tasks.real-world-memory-p2-knowledge-closeout]
workspace = false
dependencies = [
Expand Down
13 changes: 12 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,14 @@ provider-backed ELF evidence was required.
OpenViking trajectory, and graph/RAG citation/navigation remain optimization inputs
or typed blockers. The report makes P4 queue items inspectable but applies no
`decodex:queued:elf` label.
- P4 production-readiness evidence gates after XY-1074: the June 23 follow-up adds
`cargo make real-world-memory-p4-production-readiness`, a checked-in evidence
report, and `elf.operational_evidence_gates/v1`. The production-ops slice scores
7 jobs, 5 pass, 0 wrong_result, 0 incomplete, and 2 typed blockers while separating
local fixture, public-proxy, private-corpus, and provider-backed tiers. It records
latency, cost, resource, cold-start, restore, and Qdrant rebuild metrics, but keeps
the missing private-corpus manifests and provider credentials as typed blockers
instead of claiming private or provider-backed pass proof.
- Operator-approved public-proxy addendum after XY-930: the June 19 follow-up runs
`cargo make baseline-production-private-addendum` with a simulated/public-proxy
production corpus manifest approved for this stage. The run records 12 documents,
Expand Down Expand Up @@ -451,6 +459,7 @@ Detailed evidence and interpretation:
- [Temporal and Trajectory Adapter Coverage Report - June 23, 2026](docs/evidence/benchmarking/2026-06-23-temporal-trajectory-adapter-coverage-report.md)
- [Graph/RAG Adapter Matrix Report - June 23, 2026](docs/evidence/benchmarking/2026-06-23-graph-rag-adapter-matrix-report.md)
- [P3 Competitor-Strength Absorption Report - June 23, 2026](docs/evidence/benchmarking/2026-06-23-p3-competitor-strength-absorption-report.md)
- [P4 Production-Readiness Evidence Gates Report - June 23, 2026](docs/evidence/benchmarking/2026-06-23-p4-production-readiness-evidence-gates-report.md)
- [Live Baseline Benchmark Runbook](docs/runbook/benchmarking/live_baseline_benchmark.md)
- [Single-User Production Runbook](docs/runbook/single_user_production.md)
- Benchmark contract:
Expand Down Expand Up @@ -547,6 +556,7 @@ Detailed comparison, mechanism-level analysis, and source map:
- [Temporal and Trajectory Adapter Coverage Report - June 23, 2026](docs/evidence/benchmarking/2026-06-23-temporal-trajectory-adapter-coverage-report.md)
- [Graph/RAG Adapter Matrix Report - June 23, 2026](docs/evidence/benchmarking/2026-06-23-graph-rag-adapter-matrix-report.md)
- [P3 Competitor-Strength Absorption Report - June 23, 2026](docs/evidence/benchmarking/2026-06-23-p3-competitor-strength-absorption-report.md)
- [P4 Production-Readiness Evidence Gates Report - June 23, 2026](docs/evidence/benchmarking/2026-06-23-p4-production-readiness-evidence-gates-report.md)
- [Live Baseline Benchmark Runbook](docs/runbook/benchmarking/live_baseline_benchmark.md)
- [Real-World Agent Memory Benchmark](docs/runbook/benchmarking/real_world_agent_memory_benchmark.md)
- [External Memory Improvement Plan](docs/evidence/external_memory/external_memory_improvement_plan.md)
Expand All @@ -566,7 +576,8 @@ Report - June 20, 2026, and the Live Knowledge-Page Rebuild/Lint Report - June 2
Workspace PageIndex/OpenKB Closeout Report, PageIndex/OpenKB Same-Corpus Adapter
Report, and mem0/OpenMemory and Letta Memory-History/Core-Archive Adapter Report;
June 23 adds the Temporal and Trajectory Adapter Coverage Report, the Graph/RAG
Adapter Matrix Report, and the P3 Competitor-Strength Absorption Report after the
Adapter Matrix Report, the P3 Competitor-Strength Absorption Report, and the P4
Production-Readiness Evidence Gates Report after the
June 19 XY-930 operator-approved public-proxy production addendum and service-native
Dreaming readback, the qmd debug-ergonomics Dreaming retest, the June 17
competitor-strength closeout, and the June 16 temporal reconciliation, live
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -195,5 +195,12 @@
"acceptable_phrases": ["blocked until operator credentials are supplied", "must not require user secrets"],
"fallback_action": "state_blocker"
},
"tags": ["external_adapter", "production_ops", "credential_boundary", "blocked", "no_live_claim"]
"tags": [
"external_adapter",
"provider_backed",
"production_ops",
"credential_boundary",
"blocked",
"no_live_claim"
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
{
"schema": "elf.real_world_job/v1",
"job_id": "production-ops-public-proxy-addendum-001",
"suite": "production_ops",
"title": "Separate operator-approved public-proxy evidence from private and provider proof",
"corpus": {
"corpus_id": "real-world-memory-production-ops-2026-06-19-public-proxy",
"profile": "generated_public",
"items": [
{
"evidence_id": "public-proxy-addendum-pass",
"kind": "benchmark_report",
"text": "The operator-approved public-proxy production-private addendum passed 8/8 query checks with 0 wrong_result, 0 lifecycle_fail, 0 blocked, 0 incomplete, and 0 not_encoded.",
"source_ref": {
"schema": "source_ref/v1",
"resolver": "real_world_job_fixture/v1",
"ref": {
"fixture": "public_proxy_production_private_addendum",
"evidence_id": "public-proxy-addendum-pass"
},
"locator": {
"quote": "passed 8/8 query checks with 0 wrong_result"
}
},
"created_at": "2026-06-19T14:40:13Z"
},
{
"evidence_id": "public-proxy-latency-resource-cost",
"kind": "benchmark_report",
"text": "The public-proxy addendum recorded query mean latency 10.842727625 ms, P95 30.443385 ms, elapsed 1.313984156 seconds, RSS 37656 KB, and 386 estimated input tokens with no configured cost rate.",
"source_ref": {
"schema": "source_ref/v1",
"resolver": "real_world_job_fixture/v1",
"ref": {
"fixture": "public_proxy_production_private_addendum",
"evidence_id": "public-proxy-latency-resource-cost"
},
"locator": {
"quote": "query mean latency 10.842727625 ms"
}
},
"created_at": "2026-06-19T14:40:13Z"
},
{
"evidence_id": "public-proxy-claim-boundary",
"kind": "decision",
"text": "The public-proxy addendum is not real private-corpus production proof and does not prove provider-backed production quality because embedding mode was local-hash.",
"source_ref": {
"schema": "source_ref/v1",
"resolver": "real_world_job_fixture/v1",
"ref": {
"fixture": "public_proxy_production_private_addendum",
"evidence_id": "public-proxy-claim-boundary"
},
"locator": {
"quote": "not real private-corpus production proof"
}
},
"created_at": "2026-06-19T14:40:13Z"
},
{
"evidence_id": "public-proxy-private-proof-decoy",
"kind": "decision",
"text": "Decoy: the public-proxy addendum proves real private-corpus production quality and provider-backed embedding quality.",
"source_ref": {
"schema": "source_ref/v1",
"resolver": "real_world_job_fixture/v1",
"ref": {
"fixture": "public_proxy_production_private_addendum",
"evidence_id": "public-proxy-private-proof-decoy"
}
},
"created_at": "2026-06-19T14:40:13Z"
}
],
"adapter_response": {
"adapter_id": "fixture_production_ops",
"answer": {
"content": "The operator-approved public-proxy addendum passed 8/8 query checks with 0 wrong_result. It recorded query mean latency 10.842727625 ms, P95 30.443385 ms, elapsed 1.313984156 seconds, RSS 37656 KB, and 386 estimated input tokens. This is public-proxy evidence only: it is not real private-corpus production proof and does not prove provider-backed production quality because embedding mode was local-hash.",
"claims": [
{
"claim_id": "public_proxy_passed",
"text": "The operator-approved public-proxy addendum passed 8/8 query checks with 0 wrong_result.",
"evidence_ids": [
"public-proxy-addendum-pass"
],
"confidence": "high"
},
{
"claim_id": "public_proxy_operational_envelope",
"text": "The public-proxy addendum recorded latency, resource, and token cost-proxy metrics.",
"evidence_ids": [
"public-proxy-latency-resource-cost"
],
"confidence": "high"
},
{
"claim_id": "public_proxy_boundary",
"text": "The public-proxy addendum is not private-corpus or provider-backed production proof.",
"evidence_ids": [
"public-proxy-claim-boundary"
],
"confidence": "high"
}
],
"evidence_ids": [
"public-proxy-addendum-pass",
"public-proxy-latency-resource-cost",
"public-proxy-claim-boundary"
],
"latency_ms": 10.842727625,
"cost": {
"currency": "USD",
"amount": 0.0,
"input_tokens": 386,
"output_tokens": 0
}
}
}
},
"negative_traps": [
{
"trap_id": "public-proxy-as-private-proof",
"type": "unsupported_claim",
"evidence_ids": [
"public-proxy-private-proof-decoy"
],
"failure_if_used": true
}
],
"prompt": {
"role": "user",
"content": "What operational evidence did the public-proxy production-private addendum add, and what must it not be called?",
"job_mode": "operate",
"constraints": [
"cite_evidence",
"record_latency_cost_resource",
"separate_public_proxy_from_private_provider_proof"
]
},
"expected_answer": {
"must_include": [
{
"claim_id": "public_proxy_passed",
"text": "The operator-approved public-proxy addendum passed 8/8 query checks with 0 wrong_result."
},
{
"claim_id": "public_proxy_operational_envelope",
"text": "The public-proxy addendum recorded latency, resource, and token cost-proxy metrics."
},
{
"claim_id": "public_proxy_boundary",
"text": "The public-proxy addendum is not private-corpus or provider-backed production proof."
}
],
"must_not_include": [
"proves real private-corpus production quality",
"provider-backed embedding quality"
],
"evidence_links": {
"public_proxy_passed": [
"public-proxy-addendum-pass"
],
"public_proxy_operational_envelope": [
"public-proxy-latency-resource-cost"
],
"public_proxy_boundary": [
"public-proxy-claim-boundary"
]
},
"answer_type": "direct_answer",
"accepted_alternates": [],
"requires_caveat": true,
"requires_refusal": false
},
"required_evidence": [
{
"evidence_id": "public-proxy-addendum-pass",
"claim_id": "public_proxy_passed",
"requirement": "cite",
"quote": "passed 8/8 query checks with 0 wrong_result"
},
{
"evidence_id": "public-proxy-latency-resource-cost",
"claim_id": "public_proxy_operational_envelope",
"requirement": "cite",
"quote": "query mean latency 10.842727625 ms"
},
{
"evidence_id": "public-proxy-claim-boundary",
"claim_id": "public_proxy_boundary",
"requirement": "cite",
"quote": "not real private-corpus production proof"
}
],
"scoring_rubric": {
"dimensions": {
"latency_resource": {
"weight": 0.3,
"max_points": 1.0,
"criteria": {
"max_latency_ms": 50.0,
"resource_expectation": "Report public-proxy latency, resource, and token cost-proxy metrics."
}
},
"evidence_grounding": {
"weight": 0.3,
"max_points": 1.0,
"criteria": "Cites public-proxy pass, operational envelope, and claim-boundary evidence."
},
"trap_avoidance": {
"weight": 0.25,
"max_points": 1.0,
"criteria": "Does not convert public-proxy evidence into private or provider proof."
},
"workflow_helpfulness": {
"weight": 0.15,
"max_points": 1.0,
"criteria": "Explains how to use the proxy evidence safely."
}
},
"pass_threshold": 0.85,
"hard_fail_rules": [
"unsupported high-confidence claim about a required decision or fact",
"use of a negative trap marked failure_if_used = true"
]
},
"allowed_uncertainty": {
"can_answer_unknown": false,
"acceptable_phrases": [
"not real private-corpus production proof"
],
"fallback_action": "state_blocker"
},
"tags": [
"generated_public",
"public_proxy",
"production_ops",
"resource_envelope",
"no_live_claim"
]
}
Loading