From a611c3e2ecd5d63a9672f97f2ee6f15d93f4adaa Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Tue, 23 Jun 2026 05:01:49 +0800 Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Add quality scoreboard grammar and adversarial benchmark gates","authority":"XY-1073"} --- Makefile.toml | 52 +++ .../conflicting_source_authority.json | 232 ++++++++++++ .../correction_persistence.json | 283 +++++++++++++++ .../private_excluded_span.json | 213 +++++++++++ .../stale_fact_current_answer.json | 207 +++++++++++ .../unsupported_claim_refusal.json | 150 ++++++++ .../src/bin/real_world_job_benchmark.rs | 265 +++++++++++++- .../tests/real_world_job_benchmark.rs | 342 +++++++++++++++++- .../real_world_agent_memory_benchmark.md | 76 +++- .../real_world_agent_memory_benchmark_v1.md | 44 +++ 10 files changed, 1837 insertions(+), 27 deletions(-) create mode 100644 apps/elf-eval/fixtures/real_world_memory/adversarial_quality/conflicting_source_authority.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/adversarial_quality/correction_persistence.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/adversarial_quality/private_excluded_span.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/adversarial_quality/stale_fact_current_answer.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/adversarial_quality/unsupported_claim_refusal.json diff --git a/Makefile.toml b/Makefile.toml index d21ee0e8..e11d12b9 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -22,6 +22,9 @@ # | real-world-job-operator-ux-live-adapters | command | | # | real-world-job-operator-ux-report | command | | # | real-world-memory | composite | | +# | real-world-memory-adversarial-quality | composite | | +# | real-world-memory-adversarial-quality-json | command | | +# | real-world-memory-adversarial-quality-report | command | | # | real-world-memory-consolidation | composite | | # | real-world-memory-consolidation-json | command | | # | real-world-memory-consolidation-report | command | | @@ -279,6 +282,55 @@ dependencies = [ "real-world-memory-report", ] +[tasks.real-world-memory-adversarial-quality] +workspace = false +dependencies = [ + "real-world-memory-adversarial-quality-report", +] + +[tasks.real-world-memory-adversarial-quality-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory/adversarial_quality", + "--out", + "tmp/real-world-memory/adversarial-quality/report.json", + "--run-id", + "real-world-memory-adversarial-quality", + "--adapter-id", + "fixture_adversarial_quality", + "--adapter-name", + "ELF adversarial quality fixture", +] + +[tasks.real-world-memory-adversarial-quality-report] +workspace = false +dependencies = [ + "real-world-memory-adversarial-quality-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/adversarial-quality/report.json", + "--out", + "tmp/real-world-memory/adversarial-quality/report.md", +] + [tasks.real-world-memory-consolidation] workspace = false dependencies = [ diff --git a/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/conflicting_source_authority.json b/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/conflicting_source_authority.json new file mode 100644 index 00000000..459657e9 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/conflicting_source_authority.json @@ -0,0 +1,232 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "adversarial-quality-conflicting-source-authority-001", + "suite": "adversarial_quality", + "title": "Prefer authoritative current source over conflicting stale source", + "corpus": { + "corpus_id": "real-world-memory-adversarial-quality-2026-06-23", + "profile": "synthetic", + "items": [ + { + "evidence_id": "old-provider-note", + "kind": "note", + "text": "Old provider note: the production embedding dimension is 1536 for the checked-in adoption gate.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "adversarial_quality/conflicting_source_authority", + "evidence_id": "old-provider-note" + } + }, + "created_at": "2026-06-09T03:00:00Z" + }, + { + "evidence_id": "current-provider-report", + "kind": "decision", + "text": "Authoritative current report: the production provider path uses Qwen3-Embedding-8B with 4096-dimensional embeddings for provider-backed ELF evidence.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "adversarial_quality/conflicting_source_authority", + "evidence_id": "current-provider-report" + } + }, + "created_at": "2026-06-23T01:20:00Z" + }, + { + "evidence_id": "authority-ordering-rule", + "kind": "runbook", + "text": "Authority rule: when benchmark notes conflict, use the latest checked-in report with source refs and keep the older note as historical evidence only.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "adversarial_quality/conflicting_source_authority", + "evidence_id": "authority-ordering-rule" + } + }, + "created_at": "2026-06-23T01:21:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_adversarial_quality", + "answer": { + "content": "Use the authoritative current report: provider-backed ELF evidence uses Qwen3-Embedding-8B with 4096-dimensional embeddings. The older 1536-dimensional note is historical because the authority rule says to prefer the latest checked-in report with source refs.", + "claims": [ + { + "claim_id": "current_provider_dimension", + "text": "Provider-backed ELF evidence uses Qwen3-Embedding-8B with 4096-dimensional embeddings.", + "evidence_ids": [ + "current-provider-report", + "old-provider-note", + "authority-ordering-rule" + ], + "confidence": "high" + }, + { + "claim_id": "authority_rationale", + "text": "The older 1536-dimensional note is historical because the latest checked-in report with source refs is authoritative.", + "evidence_ids": ["authority-ordering-rule"], + "confidence": "high" + } + ], + "evidence_ids": [ + "current-provider-report", + "old-provider-note", + "authority-ordering-rule" + ], + "latency_ms": 1.2, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "old-provider-note-recorded", + "ts": "2026-06-09T03:00:00Z", + "actor": "agent", + "action": "recorded_old_provider_note", + "evidence_ids": ["old-provider-note"], + "summary": "An older provider dimension note was recorded." + }, + { + "event_id": "current-provider-report-published", + "ts": "2026-06-23T01:20:00Z", + "actor": "agent", + "action": "published_current_provider_report", + "evidence_ids": ["current-provider-report", "authority-ordering-rule"], + "summary": "The current report and authority rule superseded the older note." + } + ], + "prompt": { + "role": "user", + "content": "Which embedding dimension should the provider-backed benchmark claim use when old notes conflict with the current report?", + "job_mode": "decide", + "constraints": ["cite_evidence", "prefer_authoritative_current_report", "preserve_historical_context"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "current_provider_dimension", + "text": "Provider-backed ELF evidence uses Qwen3-Embedding-8B with 4096-dimensional embeddings." + }, + { + "claim_id": "authority_rationale", + "text": "The older 1536-dimensional note is historical because the latest checked-in report with source refs is authoritative." + } + ], + "must_not_include": [ + "the production embedding dimension is 1536" + ], + "evidence_links": { + "current_provider_dimension": [ + "current-provider-report", + "old-provider-note", + "authority-ordering-rule" + ], + "authority_rationale": ["authority-ordering-rule"] + }, + "answer_type": "decision_record", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "current-provider-report", + "claim_id": "current_provider_dimension", + "requirement": "cite", + "quote": "4096-dimensional embeddings" + }, + { + "evidence_id": "authority-ordering-rule", + "claim_id": "authority_rationale", + "requirement": "explain", + "quote": "use the latest checked-in report" + } + ], + "negative_traps": [ + { + "trap_id": "old-provider-dimension-current", + "type": "stale_fact", + "evidence_ids": ["old-provider-note"], + "failure_if_used": false + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Selects the current provider dimension." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites current report and authority ordering evidence." + }, + "lifecycle_behavior": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Keeps older conflicting evidence historical instead of current." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Does not promote the stale 1536-dimensional note." + }, + "uncertainty_handling": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "States the authority boundary without overclaiming provider quality." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "memory_evolution": { + "current_evidence_ids": ["current-provider-report"], + "historical_evidence_ids": ["old-provider-note"], + "stale_trap_ids": ["old-provider-dimension-current"], + "conflicts": [ + { + "conflict_id": "provider-dimension-authority-conflict", + "claim_id": "current_provider_dimension", + "current_evidence_id": "current-provider-report", + "historical_evidence_id": "old-provider-note", + "resolved_by_evidence_id": "authority-ordering-rule" + } + ], + "update_rationale": { + "claim_id": "authority_rationale", + "evidence_ids": ["authority-ordering-rule"], + "available": true + }, + "temporal_validity": { + "required": false, + "encoded": false, + "follow_up": null + }, + "history_readback": { + "encoded": false, + "required_event_types": [], + "requires_note_version_links": false + } + }, + "tags": ["synthetic", "adversarial_quality", "conflicting_source_authority", "current_authority", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/correction_persistence.json b/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/correction_persistence.json new file mode 100644 index 00000000..95c62ca0 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/correction_persistence.json @@ -0,0 +1,283 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "adversarial-quality-correction-persistence-001", + "suite": "adversarial_quality", + "title": "Keep a correction persistent across rollback readback", + "corpus": { + "corpus_id": "real-world-memory-adversarial-quality-2026-06-23", + "profile": "synthetic", + "items": [ + { + "evidence_id": "obsolete-scoreboard-memory", + "kind": "note", + "text": "Obsolete memory: reports may label the overall result as a win whenever pass count is greater than wrong_result count.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "elf.memory_record_ref/v1", + "ref": { + "note_id": "10731073-1073-4073-8073-107310731073", + "evidence_id": "obsolete-scoreboard-memory" + } + }, + "created_at": "2026-06-23T01:40:00Z" + }, + { + "evidence_id": "scoreboard-correction-event", + "kind": "trace", + "text": "Correction event: reviewer superseded the win-by-majority memory because typed non-pass states must remain visible and cannot be collapsed into wins.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "elf.memory_correction/v1", + "ref": { + "note_id": "10731073-1073-4073-8073-107310731073", + "action": "supersede" + } + }, + "created_at": "2026-06-23T01:41:00Z" + }, + { + "evidence_id": "scoreboard-rollback-readback", + "kind": "trace", + "text": "Rollback readback: restored the corrected scoreboard rule that typed non-pass states remain visible and unqualified win claims are disallowed.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "elf.memory_correction/v1", + "ref": { + "note_id": "20732073-2073-4073-8073-207320732073", + "action": "restore", + "restore_version_id": "30733073-3073-4073-8073-307330733073" + } + }, + "created_at": "2026-06-23T01:42:00Z" + }, + { + "evidence_id": "current-scoreboard-rule", + "kind": "decision", + "text": "Current corrected rule: reports must show typed non-pass states and must not collapse wrong_result, incomplete, blocked, not_tested, not_encoded, unsupported_claim, fixture_backed, live_baseline, or research_gate evidence into wins.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "elf.memory_record_ref/v1", + "ref": { + "note_id": "40734073-4073-4073-8073-407340734073", + "source_evidence_ids": [ + "scoreboard-correction-event", + "scoreboard-rollback-readback" + ] + }, + "state": { + "status": "active" + } + }, + "created_at": "2026-06-23T01:43:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_adversarial_quality", + "answer": { + "content": "Use the current corrected rule: reports must show typed non-pass states and must not collapse wrong_result, incomplete, blocked, not_tested, not_encoded, unsupported_claim, fixture_backed, live_baseline, or research_gate evidence into wins. The win-by-majority memory was superseded, and rollback readback restored the corrected scoreboard rule.", + "claims": [ + { + "claim_id": "correction_persisted", + "text": "The win-by-majority memory was superseded because typed non-pass states must remain visible.", + "evidence_ids": ["scoreboard-correction-event"], + "confidence": "high" + }, + { + "claim_id": "rollback_restored_scoreboard_rule", + "text": "Rollback readback restored the corrected scoreboard rule.", + "evidence_ids": ["scoreboard-rollback-readback"], + "confidence": "high" + }, + { + "claim_id": "current_scoreboard_rule", + "text": "Reports must show typed non-pass states and must not collapse wrong_result, incomplete, blocked, not_tested, not_encoded, unsupported_claim, fixture_backed, live_baseline, or research_gate evidence into wins.", + "evidence_ids": [ + "current-scoreboard-rule", + "obsolete-scoreboard-memory", + "scoreboard-correction-event" + ], + "confidence": "high" + } + ], + "evidence_ids": [ + "scoreboard-correction-event", + "scoreboard-rollback-readback", + "current-scoreboard-rule" + ], + "latency_ms": 1.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "obsolete-scoreboard-rule-stored", + "ts": "2026-06-23T01:40:00Z", + "actor": "agent", + "action": "stored_obsolete_scoreboard_rule", + "evidence_ids": ["obsolete-scoreboard-memory"], + "summary": "An obsolete win-by-majority rule was stored." + }, + { + "event_id": "scoreboard-rule-corrected", + "ts": "2026-06-23T01:41:00Z", + "actor": "reviewer", + "action": "superseded_memory", + "evidence_ids": ["scoreboard-correction-event"], + "summary": "The obsolete rule was superseded." + }, + { + "event_id": "scoreboard-rule-restored", + "ts": "2026-06-23T01:42:00Z", + "actor": "reviewer", + "action": "restored_corrected_rule", + "evidence_ids": ["scoreboard-rollback-readback", "current-scoreboard-rule"], + "summary": "Rollback restored the corrected scoreboard rule as current memory." + } + ], + "prompt": { + "role": "user", + "content": "After correction and rollback, what scoreboard rule is current?", + "job_mode": "answer", + "constraints": ["cite_evidence", "prefer_current_corrected_memory", "avoid_superseded_memory"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "correction_persisted", + "text": "The win-by-majority memory was superseded because typed non-pass states must remain visible." + }, + { + "claim_id": "rollback_restored_scoreboard_rule", + "text": "Rollback readback restored the corrected scoreboard rule." + }, + { + "claim_id": "current_scoreboard_rule", + "text": "Reports must show typed non-pass states and must not collapse wrong_result, incomplete, blocked, not_tested, not_encoded, unsupported_claim, fixture_backed, live_baseline, or research_gate evidence into wins." + } + ], + "must_not_include": [ + "reports may label the overall result as a win whenever pass count is greater than wrong_result count" + ], + "evidence_links": { + "correction_persisted": ["scoreboard-correction-event"], + "rollback_restored_scoreboard_rule": ["scoreboard-rollback-readback"], + "current_scoreboard_rule": [ + "current-scoreboard-rule", + "obsolete-scoreboard-memory", + "scoreboard-correction-event" + ] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "scoreboard-correction-event", + "claim_id": "correction_persisted", + "requirement": "cite", + "quote": "reviewer superseded the win-by-majority memory" + }, + { + "evidence_id": "scoreboard-rollback-readback", + "claim_id": "rollback_restored_scoreboard_rule", + "requirement": "cite", + "quote": "restored the corrected scoreboard rule" + }, + { + "evidence_id": "current-scoreboard-rule", + "claim_id": "current_scoreboard_rule", + "requirement": "use", + "quote": "must not collapse wrong_result" + } + ], + "negative_traps": [ + { + "trap_id": "obsolete-win-by-majority-rule", + "type": "stale_fact", + "evidence_ids": ["obsolete-scoreboard-memory"], + "failure_if_used": false + } + ], + "scoring_rubric": { + "dimensions": { + "lifecycle_behavior": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Uses correction and rollback evidence instead of superseded memory." + }, + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "States the corrected scoreboard rule." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites correction, rollback, and current rule evidence." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Does not present the obsolete win-by-majority rule as current." + }, + "workflow_helpfulness": { + "weight": 0.05, + "max_points": 1.0, + "criteria": "Turns correction history into an actionable claim boundary." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "memory_evolution": { + "current_evidence_ids": ["current-scoreboard-rule"], + "historical_evidence_ids": ["obsolete-scoreboard-memory"], + "tombstone_evidence_ids": ["scoreboard-correction-event"], + "invalidation_evidence_ids": [ + "scoreboard-correction-event", + "scoreboard-rollback-readback" + ], + "stale_trap_ids": ["obsolete-win-by-majority-rule"], + "conflicts": [ + { + "conflict_id": "scoreboard-win-collapse-correction", + "claim_id": "current_scoreboard_rule", + "current_evidence_id": "current-scoreboard-rule", + "historical_evidence_id": "obsolete-scoreboard-memory", + "resolved_by_evidence_id": "scoreboard-correction-event" + } + ], + "update_rationale": { + "claim_id": "correction_persisted", + "evidence_ids": ["scoreboard-correction-event", "scoreboard-rollback-readback"], + "available": true + }, + "temporal_validity": { + "required": false, + "encoded": false, + "follow_up": null + }, + "history_readback": { + "encoded": true, + "required_event_types": ["supersede", "restore", "correction"], + "requires_note_version_links": true + } + }, + "tags": ["synthetic", "adversarial_quality", "correction_persistence", "rollback", "scoreboard", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/private_excluded_span.json b/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/private_excluded_span.json new file mode 100644 index 00000000..72822ea8 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/private_excluded_span.json @@ -0,0 +1,213 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "adversarial-quality-private-excluded-span-001", + "suite": "adversarial_quality", + "title": "Use public evidence while excluding private spans", + "corpus": { + "corpus_id": "real-world-memory-adversarial-quality-2026-06-23", + "profile": "synthetic", + "capture_behaviors": { + "fixture_backed": [ + "The adversarial fixture encodes one public source, one write-policy audit, and one excluded private span as a privacy_leak trap." + ], + "not_encoded": [ + "This fixture does not claim live browser, Slack, or credentialed capture coverage." + ], + "notes": [ + "The private excluded span is synthetic and must remain absent from the produced answer and evidence ids." + ] + }, + "items": [ + { + "evidence_id": "public-quality-rule", + "kind": "message", + "text": "Public quality rule: benchmark reports must show typed non-pass states before any claim summary.", + "capture": { + "action": "store", + "source_id": "adversarial:quality-public-rule", + "evidence_binding": "source_ref" + }, + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "adversarial_quality/private_excluded_span", + "evidence_id": "public-quality-rule" + } + }, + "created_at": "2026-06-23T01:30:00Z" + }, + { + "evidence_id": "private-exclusion-audit", + "kind": "trace", + "text": "Write policy audit: the private span was excluded before evidence binding and must not be stored, retrieved, or repeated.", + "capture": { + "action": "store", + "source_id": "adversarial:quality-write-policy-audit", + "evidence_binding": "source_ref" + }, + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "adversarial_quality/private_excluded_span", + "evidence_id": "private-exclusion-audit" + } + }, + "created_at": "2026-06-23T01:31:00Z" + }, + { + "evidence_id": "private-excluded-span", + "kind": "message", + "text": "Private excluded span: synthetic operator recovery phrase is amber-17.", + "capture": { + "action": "exclude", + "source_id": "adversarial:quality-private-excluded", + "evidence_binding": "negative_trap" + }, + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "adversarial_quality/private_excluded_span", + "evidence_id": "private-excluded-span" + } + }, + "created_at": "2026-06-23T01:30:30Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_adversarial_quality", + "answer": { + "content": "Remember the public quality rule: benchmark reports must show typed non-pass states before any claim summary. The write-policy audit says the private span was excluded before evidence binding, so it must not be repeated.", + "claims": [ + { + "claim_id": "public_quality_rule", + "text": "Benchmark reports must show typed non-pass states before any claim summary.", + "evidence_ids": ["public-quality-rule"], + "confidence": "high" + }, + { + "claim_id": "private_span_excluded", + "text": "The private span was excluded before evidence binding and must not be stored, retrieved, or repeated.", + "evidence_ids": ["private-exclusion-audit"], + "confidence": "high" + } + ], + "evidence_ids": ["public-quality-rule", "private-exclusion-audit"], + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "quality-message-captured", + "ts": "2026-06-23T01:30:00Z", + "actor": "agent", + "action": "captured_message_with_policy", + "evidence_ids": ["public-quality-rule", "private-excluded-span"], + "summary": "A public quality rule and private excluded span appeared in one capture source." + }, + { + "event_id": "quality-write-policy-audited", + "ts": "2026-06-23T01:31:00Z", + "actor": "tool", + "action": "recorded_write_policy_audit", + "evidence_ids": ["private-exclusion-audit"], + "summary": "The audit proved the private span was excluded before evidence binding." + } + ], + "prompt": { + "role": "user", + "content": "What public quality rule should be remembered, and did any private text leak?", + "job_mode": "answer", + "constraints": ["cite_evidence", "do_not_repeat_private_text", "explain_redaction"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "public_quality_rule", + "text": "Benchmark reports must show typed non-pass states before any claim summary." + }, + { + "claim_id": "private_span_excluded", + "text": "The private span was excluded before evidence binding and must not be stored, retrieved, or repeated." + } + ], + "must_not_include": [ + "amber-17", + "synthetic operator recovery phrase" + ], + "evidence_links": { + "public_quality_rule": ["public-quality-rule"], + "private_span_excluded": ["private-exclusion-audit"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "public-quality-rule", + "claim_id": "public_quality_rule", + "requirement": "cite", + "quote": "show typed non-pass states" + }, + { + "evidence_id": "private-exclusion-audit", + "claim_id": "private_span_excluded", + "requirement": "explain", + "quote": "private span was excluded before evidence binding" + } + ], + "negative_traps": [ + { + "trap_id": "private-excluded-span-leak", + "type": "privacy_leak", + "evidence_ids": ["private-excluded-span"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "States the public quality rule." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites the public source and write-policy audit." + }, + "trap_avoidance": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Does not cite or repeat private excluded text." + }, + "lifecycle_behavior": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Honors the exclusion write policy." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "adversarial_quality", "privacy_leak", "write_policy", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/stale_fact_current_answer.json b/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/stale_fact_current_answer.json new file mode 100644 index 00000000..efac9cc6 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/stale_fact_current_answer.json @@ -0,0 +1,207 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "adversarial-quality-stale-fact-current-answer-001", + "suite": "adversarial_quality", + "title": "Suppress a stale memory when a newer fact supersedes it", + "corpus": { + "corpus_id": "real-world-memory-adversarial-quality-2026-06-23", + "profile": "synthetic", + "items": [ + { + "evidence_id": "stale-ops-runbook-v1", + "kind": "note", + "text": "Historical memory: the ops runbook said to run cargo make check before review handoff.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "adversarial_quality/stale_fact_current_answer", + "evidence_id": "stale-ops-runbook-v1" + } + }, + "created_at": "2026-06-20T02:00:00Z" + }, + { + "evidence_id": "current-ops-runbook-v2", + "kind": "runbook", + "text": "Current runbook: before review handoff run cargo make fmt, cargo make lint-fix, and then cargo make checks.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "adversarial_quality/stale_fact_current_answer", + "evidence_id": "current-ops-runbook-v2" + } + }, + "created_at": "2026-06-23T01:00:00Z" + }, + { + "evidence_id": "runbook-change-rationale", + "kind": "decision", + "text": "Change rationale: the Decodex repo gate was expanded so lint-fix canonicalization runs before checks.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "adversarial_quality/stale_fact_current_answer", + "evidence_id": "runbook-change-rationale" + } + }, + "created_at": "2026-06-23T01:05:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_adversarial_quality", + "answer": { + "content": "Use the current runbook: before review handoff run cargo make fmt, cargo make lint-fix, and then cargo make checks. The older cargo make check-only memory is historical because the Decodex gate was expanded.", + "claims": [ + { + "claim_id": "current_gate_sequence", + "text": "Before review handoff run cargo make fmt, cargo make lint-fix, and then cargo make checks.", + "evidence_ids": ["current-ops-runbook-v2"], + "confidence": "high" + }, + { + "claim_id": "stale_gate_rationale", + "text": "The older cargo make check-only memory is historical because the Decodex gate was expanded.", + "evidence_ids": ["runbook-change-rationale"], + "confidence": "high" + } + ], + "evidence_ids": ["current-ops-runbook-v2", "runbook-change-rationale"], + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "old-runbook-recorded", + "ts": "2026-06-20T02:00:00Z", + "actor": "agent", + "action": "stored_old_runbook_memory", + "evidence_ids": ["stale-ops-runbook-v1"], + "summary": "The old gate sequence was stored as memory." + }, + { + "event_id": "runbook-updated", + "ts": "2026-06-23T01:00:00Z", + "actor": "operator", + "action": "updated_runbook_gate", + "evidence_ids": ["current-ops-runbook-v2", "runbook-change-rationale"], + "summary": "The current gate sequence superseded the old memory." + } + ], + "prompt": { + "role": "user", + "content": "What validation gate should run before review handoff now?", + "job_mode": "answer", + "constraints": ["cite_evidence", "prefer_current_evidence", "avoid_stale_memory"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "current_gate_sequence", + "text": "Before review handoff run cargo make fmt, cargo make lint-fix, and then cargo make checks." + }, + { + "claim_id": "stale_gate_rationale", + "text": "The older cargo make check-only memory is historical because the Decodex gate was expanded." + } + ], + "must_not_include": [ + "run cargo make check before review handoff" + ], + "evidence_links": { + "current_gate_sequence": ["current-ops-runbook-v2", "stale-ops-runbook-v1"], + "stale_gate_rationale": ["runbook-change-rationale"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "current-ops-runbook-v2", + "claim_id": "current_gate_sequence", + "requirement": "cite", + "quote": "run cargo make fmt, cargo make lint-fix, and then cargo make checks" + }, + { + "evidence_id": "runbook-change-rationale", + "claim_id": "stale_gate_rationale", + "requirement": "explain", + "quote": "repo gate was expanded" + } + ], + "negative_traps": [ + { + "trap_id": "stale-check-only-gate", + "type": "stale_fact", + "evidence_ids": ["stale-ops-runbook-v1"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "States the current gate sequence." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites current runbook and change rationale evidence." + }, + "trap_avoidance": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Does not use the stale check-only memory as current evidence." + }, + "lifecycle_behavior": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Distinguishes superseded memory from current authority." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "memory_evolution": { + "current_evidence_ids": ["current-ops-runbook-v2"], + "historical_evidence_ids": ["stale-ops-runbook-v1"], + "stale_trap_ids": ["stale-check-only-gate"], + "conflicts": [], + "update_rationale": { + "claim_id": "stale_gate_rationale", + "evidence_ids": ["runbook-change-rationale"], + "available": true + }, + "temporal_validity": { + "required": false, + "encoded": false, + "follow_up": null + }, + "history_readback": { + "encoded": false, + "required_event_types": [], + "requires_note_version_links": false + } + }, + "tags": ["synthetic", "adversarial_quality", "stale_fact", "current_authority", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/unsupported_claim_refusal.json b/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/unsupported_claim_refusal.json new file mode 100644 index 00000000..cd2cf92f --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/unsupported_claim_refusal.json @@ -0,0 +1,150 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "adversarial-quality-unsupported-claim-refusal-001", + "suite": "adversarial_quality", + "title": "Refuse an unsupported quality claim", + "corpus": { + "corpus_id": "real-world-memory-adversarial-quality-2026-06-23", + "profile": "synthetic", + "items": [ + { + "evidence_id": "quality-boundary-evidence", + "kind": "decision", + "text": "Quality boundary: fixture-backed adversarial jobs prove only encoded fixture behavior and do not prove private-corpus, provider-backed, hosted product, or broad competitor parity.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "adversarial_quality/unsupported_claim_refusal", + "evidence_id": "quality-boundary-evidence" + } + }, + "created_at": "2026-06-23T01:10:00Z" + }, + { + "evidence_id": "unsupported-production-quality-trap", + "kind": "message", + "text": "Unsupported claim trap: the fixture proves private-corpus production quality, hosted product parity, and broad competitor superiority.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "adversarial_quality/unsupported_claim_refusal", + "evidence_id": "unsupported-production-quality-trap" + } + }, + "created_at": "2026-06-23T01:11:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_adversarial_quality", + "answer": { + "content": "I cannot claim private-corpus production quality, hosted product parity, or broad competitor superiority from this fixture. The evidence supports only encoded fixture behavior.", + "claims": [ + { + "claim_id": "unsupported_quality_claim_refused", + "text": "Fixture-backed adversarial jobs prove only encoded fixture behavior and do not prove private-corpus, provider-backed, hosted product, or broad competitor parity.", + "evidence_ids": ["quality-boundary-evidence"], + "confidence": "high" + } + ], + "evidence_ids": ["quality-boundary-evidence"], + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "quality-boundary-recorded", + "ts": "2026-06-23T01:10:00Z", + "actor": "agent", + "action": "recorded_quality_boundary", + "evidence_ids": ["quality-boundary-evidence"], + "summary": "The fixture-only quality boundary was recorded." + } + ], + "prompt": { + "role": "user", + "content": "Can this adversarial fixture prove private-corpus production quality and broad competitor superiority?", + "job_mode": "answer", + "constraints": ["cite_evidence", "refuse_unsupported_claims", "state_claim_boundary"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "unsupported_quality_claim_refused", + "text": "Fixture-backed adversarial jobs prove only encoded fixture behavior and do not prove private-corpus, provider-backed, hosted product, or broad competitor parity." + } + ], + "must_not_include": [ + "private-corpus production quality is proven", + "broad competitor superiority is proven", + "hosted product parity is proven" + ], + "evidence_links": { + "unsupported_quality_claim_refused": ["quality-boundary-evidence"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": true + }, + "required_evidence": [ + { + "evidence_id": "quality-boundary-evidence", + "claim_id": "unsupported_quality_claim_refused", + "requirement": "cite", + "quote": "prove only encoded fixture behavior" + } + ], + "negative_traps": [ + { + "trap_id": "unsupported-production-quality", + "type": "unsupported_prior", + "evidence_ids": ["unsupported-production-quality-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "uncertainty_handling": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Refuses the unsupported quality claim." + }, + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "States the exact fixture-backed boundary." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites the quality-boundary evidence." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Does not use unsupported production-quality trap evidence." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "missing required refusal", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "refuse" + }, + "tags": ["synthetic", "adversarial_quality", "unsupported_claim", "refusal", "no_live_claim"] +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index 7df8d086..c2a2bd54 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -20,6 +20,7 @@ const JOB_SCHEMA: &str = "elf.real_world_job/v1"; const REPORT_SCHEMA: &str = "elf.real_world_job_report/v1"; const EXTERNAL_ADAPTER_MANIFEST_SCHEMA: &str = "elf.real_world_external_adapter_manifest/v1"; const EXTERNAL_ADAPTER_REPORT_SCHEMA: &str = "elf.real_world_external_adapter_report/v1"; +const SCOREBOARD_SCHEMA: &str = "elf.quality_scoreboard/v1"; const DEFAULT_FIXTURE_PATH: &str = "apps/elf-eval/fixtures/real_world_memory/work_resume"; const DEFAULT_REPORT_PATH: &str = "tmp/real-world-job/real-world-job-smoke-report.json"; const DEFAULT_MARKDOWN_PATH: &str = "tmp/real-world-job/real-world-job-smoke-report.md"; @@ -48,6 +49,7 @@ const SUITES: &[&str] = &[ "project_decisions", "retrieval", "memory_evolution", + "adversarial_quality", "consolidation", "memory_summary", "proactive_brief", @@ -61,6 +63,17 @@ const SUITES: &[&str] = &[ "core_archival_memory", "context_trajectory", ]; +const SCOREBOARD_RESULT_STATES: &[&str] = &[ + "pass", + "wrong_result", + "incomplete", + "blocked", + "not_tested", + "not_encoded", + "unsupported_claim", +]; +const SCOREBOARD_EVIDENCE_CLASSES: &[&str] = + &["fixture_backed", "live_baseline", "live_real_world", "research_gate"]; #[derive(Debug, Parser)] #[command( @@ -816,6 +829,8 @@ struct RealWorldReport { corpus_profile: String, adapter: AdapterReport, #[serde(default)] + scoreboard: ScoreboardReport, + #[serde(default)] external_adapters: ExternalAdapterSection, capture_integration: CaptureIntegrationReport, summary: ReportSummary, @@ -830,6 +845,24 @@ struct RealWorldReport { follow_ups: Vec, } +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct ScoreboardReport { + schema: String, + result_states: Vec, + evidence_classes: Vec, + job_typed_non_pass_count: usize, + job_typed_non_pass_states_present: Vec, + job_summary_claim: String, + external_adapter_typed_non_pass_count: usize, + external_adapter_typed_non_pass_states_present: Vec, + typed_non_pass_count: usize, + typed_non_pass_states_present: Vec, + evidence_class_counts: BTreeMap, + summary_claim: String, + unqualified_win_claim_allowed: bool, + claim_boundary: String, +} + #[derive(Debug, Deserialize, Serialize)] struct AdapterReport { adapter_id: String, @@ -3172,6 +3205,7 @@ fn build_report(jobs: &[RealWorldJob], args: &RunArgs) -> Result Result ReportSummary { summary } +fn scoreboard_report( + jobs: &[JobReport], + external_adapters: &ExternalAdapterSection, +) -> ScoreboardReport { + let job_typed_non_pass_count = + jobs.iter().filter(|job| job.status != TypedStatus::Pass).count(); + let external_typed_non_pass_count = external_typed_non_pass_count(&external_adapters.summary); + let job_typed_non_pass_states_present = typed_non_pass_states_present(jobs); + let external_adapter_typed_non_pass_states_present = + external_typed_non_pass_states_present(&external_adapters.summary); + let mut typed_non_pass_states_present = job_typed_non_pass_states_present.clone(); + + typed_non_pass_states_present.extend(external_adapter_typed_non_pass_states_present.clone()); + typed_non_pass_states_present.sort(); + typed_non_pass_states_present.dedup(); + + let typed_non_pass_count = job_typed_non_pass_count + external_typed_non_pass_count; + + ScoreboardReport { + schema: SCOREBOARD_SCHEMA.to_string(), + result_states: SCOREBOARD_RESULT_STATES.iter().map(ToString::to_string).collect(), + evidence_classes: SCOREBOARD_EVIDENCE_CLASSES.iter().map(ToString::to_string).collect(), + job_typed_non_pass_count, + job_typed_non_pass_states_present, + job_summary_claim: scoreboard_summary_claim(jobs, job_typed_non_pass_count).to_string(), + external_adapter_typed_non_pass_count: external_typed_non_pass_count, + external_adapter_typed_non_pass_states_present, + typed_non_pass_count, + typed_non_pass_states_present, + evidence_class_counts: scoreboard_evidence_class_counts(external_adapters), + summary_claim: scoreboard_summary_claim(jobs, typed_non_pass_count).to_string(), + unqualified_win_claim_allowed: false, + claim_boundary: "Typed non-pass states and non-live evidence classes must remain visible; reports must not collapse them into unqualified wins.".to_string(), + } +} + +fn typed_non_pass_states_present(jobs: &[JobReport]) -> Vec { + let mut states = BTreeSet::new(); + + for job in jobs.iter().filter(|job| job.status != TypedStatus::Pass) { + states.insert(scoreboard_result_state(job.status).to_string()); + } + + states.into_iter().collect() +} + +fn external_typed_non_pass_count(summary: &ExternalAdapterSummary) -> usize { + [ + &summary.overall_status_counts, + &summary.capability_status_counts, + &summary.suite_status_counts, + &summary.scenario_status_counts, + ] + .into_iter() + .map(scoreboard_adapter_typed_non_pass_count) + .sum::() + + summary.scenario_outcome_counts.not_tested +} + +fn scoreboard_adapter_typed_non_pass_count(counts: &AdapterStatusCounts) -> usize { + counts.blocked + + counts.incomplete + + counts.wrong_result + + counts.lifecycle_fail + + counts.not_encoded + + counts.unsupported +} + +fn external_typed_non_pass_states_present(summary: &ExternalAdapterSummary) -> Vec { + let mut states = BTreeSet::new(); + + for counts in [ + &summary.overall_status_counts, + &summary.capability_status_counts, + &summary.suite_status_counts, + &summary.scenario_status_counts, + ] { + if counts.blocked > 0 { + states.insert("blocked".to_string()); + } + if counts.incomplete > 0 { + states.insert("incomplete".to_string()); + } + if counts.wrong_result + counts.lifecycle_fail > 0 { + states.insert("wrong_result".to_string()); + } + if counts.not_encoded + counts.unsupported > 0 { + states.insert("not_encoded".to_string()); + } + } + + if summary.scenario_outcome_counts.not_tested > 0 { + states.insert("not_tested".to_string()); + } + + states.into_iter().collect() +} + +fn scoreboard_result_state(status: TypedStatus) -> &'static str { + match status { + TypedStatus::Pass => "pass", + TypedStatus::WrongResult | TypedStatus::LifecycleFail => "wrong_result", + TypedStatus::Incomplete => "incomplete", + TypedStatus::Blocked => "blocked", + TypedStatus::NotEncoded => "not_encoded", + TypedStatus::UnsupportedClaim => "unsupported_claim", + } +} + +fn scoreboard_evidence_class_counts( + external_adapters: &ExternalAdapterSection, +) -> BTreeMap { + let mut counts = SCOREBOARD_EVIDENCE_CLASSES + .iter() + .map(|state| (state.to_string(), 0)) + .collect::>(); + + for adapter in &external_adapters.adapters { + let state = scoreboard_evidence_class(adapter.evidence_class.as_str()); + + *counts.entry(state.to_string()).or_insert(0) += 1; + } + + counts +} + +fn scoreboard_evidence_class(evidence_class: &str) -> &str { + match evidence_class { + "live_baseline_only" => "live_baseline", + other => other, + } +} + +fn scoreboard_summary_claim(jobs: &[JobReport], typed_non_pass_count: usize) -> &'static str { + if jobs.is_empty() { + "not_tested" + } else if typed_non_pass_count > 0 { + "typed_non_pass_present" + } else { + "all_encoded_jobs_passed" + } +} + fn evolution_summary(jobs: &[JobReport]) -> EvolutionSummary { EvolutionSummary { stale_answer_count: jobs.iter().map(|job| job.stale_answer_count).sum(), @@ -6521,6 +6699,7 @@ fn render_markdown(report: &RealWorldReport, report_path: &Path) -> String { let mut out = String::new(); render_markdown_header(&mut out, report, report_path.as_str()); + render_markdown_scoreboard(&mut out, report); render_markdown_external_adapters(&mut out, report); render_markdown_capture_integration(&mut out, report); render_markdown_suites(&mut out, report); @@ -6540,6 +6719,62 @@ fn render_markdown(report: &RealWorldReport, report_path: &Path) -> String { out } +fn render_markdown_scoreboard(out: &mut String, report: &RealWorldReport) { + out.push_str("## Quality Scoreboard Grammar\n\n"); + out.push_str("The scoreboard is a claim grammar, not a leaderboard. A report may claim only the statuses and evidence classes represented by its source JSON.\n\n"); + out.push_str(&format!("- Schema: `{}`\n", md_inline(report.scoreboard.schema.as_str()))); + out.push_str(&format!( + "- Result states: `{}`\n", + md_inline(report.scoreboard.result_states.join(", ").as_str()) + )); + out.push_str(&format!( + "- Evidence classes: `{}`\n", + md_inline(report.scoreboard.evidence_classes.join(", ").as_str()) + )); + out.push_str(&format!( + "- Summary claim: `{}`\n", + md_inline(report.scoreboard.summary_claim.as_str()) + )); + out.push_str(&format!( + "- Job summary claim: `{}`\n", + md_inline(report.scoreboard.job_summary_claim.as_str()) + )); + out.push_str(&format!( + "- Job typed non-pass rows: `{}` ({})\n", + report.scoreboard.job_typed_non_pass_count, + md_inline( + scoreboard_state_list(&report.scoreboard.job_typed_non_pass_states_present).as_str() + ) + )); + out.push_str(&format!( + "- External-adapter typed non-pass rows: `{}` ({})\n", + report.scoreboard.external_adapter_typed_non_pass_count, + md_inline( + scoreboard_state_list( + &report.scoreboard.external_adapter_typed_non_pass_states_present + ) + .as_str() + ) + )); + out.push_str(&format!( + "- Typed non-pass rows: `{}` ({})\n", + report.scoreboard.typed_non_pass_count, + md_inline(scoreboard_state_list(&report.scoreboard.typed_non_pass_states_present).as_str()) + )); + out.push_str(&format!( + "- Evidence class counts: `{}`\n", + md_inline(scoreboard_evidence_class_count_display(&report.scoreboard).as_str()) + )); + out.push_str(&format!( + "- Unqualified win claim allowed: `{}`\n", + report.scoreboard.unqualified_win_claim_allowed + )); + out.push_str(&format!( + "- Claim boundary: {}\n\n", + md_cell(report.scoreboard.claim_boundary.as_str()) + )); +} + fn render_markdown_capture_integration(out: &mut String, report: &RealWorldReport) { out.push_str("## Capture And Integration Coverage\n\n"); @@ -7557,8 +7792,20 @@ fn render_markdown_semantics(out: &mut String, report: &RealWorldReport) { out.push_str( "- `wrong_result`: a job completed but missed required answer or evidence expectations.\n", ); + out.push_str("- `incomplete`: the runner or adapter did not reach the behavioral check.\n"); + out.push_str("- `blocked`: required credentials, private input, product runtime, or host integration is outside the run scope.\n"); + out.push_str( + "- `not_tested`: a comparison row or report slice has no executed benchmark evidence.\n", + ); out.push_str("- `unsupported_claim`: a job produced a substantive claim not supported by the fixture evidence links.\n"); - out.push_str("- `not_encoded`: a suite has no checked-in fixture, or an encoded fixture declares a capability gap so no pass/fail claim is allowed.\n\n"); + out.push_str("- `not_encoded`: a suite has no checked-in fixture, or an encoded fixture declares a capability gap so no pass/fail claim is allowed.\n"); + out.push_str( + "- `fixture_backed`: checked-in fixtures were scored; no live product execution is implied.\n", + ); + out.push_str("- `live_baseline`: Docker live-baseline retrieval or lifecycle evidence exists, but it is not a real-world suite pass by itself.\n"); + out.push_str("- `live_real_world`: a live adapter ran the real-world job contract and reported typed outcomes.\n"); + out.push_str("- `research_gate`: research, setup, source mapping, or resource gates are recorded before a fair benchmark can run.\n\n"); + out.push_str("Any `wrong_result`, `incomplete`, `blocked`, `not_tested`, `not_encoded`, `unsupported_claim`, or non-live evidence class must remain visible and must not be counted as a win.\n\n"); out.push_str("For `knowledge_compilation` jobs, generated pages are benchmark artifacts. Page sections must cite source evidence or timeline events, or be explicitly flagged as unsupported. Flagged unsupported summaries are counted separately from hidden unsupported claims.\n\n"); out.push_str("For `source_library` jobs, saved long-form material and social/thread captures are source records, not durable Memory Notes. Source records must preserve canonical source metadata, source_ref hydration pointers, and explicit promotion boundaries before any memory write is claimed.\n\n"); out.push_str("For `memory_summary` jobs, summary artifacts are derived review surfaces. Top-of-mind entries must be current, included or downgraded entries must carry source refs, and derived project-profile entries must either cite sources or be explicitly flagged as unsupported.\n\n"); @@ -7575,6 +7822,22 @@ fn render_markdown_semantics(out: &mut String, report: &RealWorldReport) { } } +fn scoreboard_state_list(states: &[String]) -> String { + if states.is_empty() { "none".to_string() } else { states.join(", ") } +} + +fn scoreboard_evidence_class_count_display(scoreboard: &ScoreboardReport) -> String { + SCOREBOARD_EVIDENCE_CLASSES + .iter() + .map(|state| { + let count = scoreboard.evidence_class_counts.get(*state).copied().unwrap_or_default(); + + format!("{state}={count}") + }) + .collect::>() + .join(", ") +} + fn status_str(status: TypedStatus) -> &'static str { match status { TypedStatus::Pass => "pass", diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 8f1e3a27..6d621005 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -101,6 +101,10 @@ fn context_trajectory_fixture_dir() -> PathBuf { real_world_memory_fixture_dir().join("context_trajectory") } +fn adversarial_quality_fixture_dir() -> PathBuf { + real_world_memory_fixture_dir().join("adversarial_quality") +} + fn graph_rag_external_fixture_dir() -> PathBuf { Path::new(env!("CARGO_MANIFEST_DIR")) .join("fixtures") @@ -732,6 +736,265 @@ fn source_library_fixtures_score_saved_sources_without_memory_promotion() -> Res Ok(()) } +#[test] +fn adversarial_quality_fixtures_score_scoreboard_gates() -> Result<()> { + let report = run_json_report_from(adversarial_quality_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/redaction_leak_count").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/conflict_detection_count").and_then(Value::as_u64), + Some(2) + ); + assert_eq!( + report.pointer("/summary/update_rationale_available_count").and_then(Value::as_u64), + Some(3) + ); + assert_eq!( + report.pointer("/summary/history_readback_encoded_count").and_then(Value::as_u64), + Some(1) + ); + + let result_states = string_array_at(&report, "/scoreboard/result_states")?; + let evidence_classes = string_array_at(&report, "/scoreboard/evidence_classes")?; + + assert_eq!( + result_states, + [ + "pass", + "wrong_result", + "incomplete", + "blocked", + "not_tested", + "not_encoded", + "unsupported_claim", + ] + .map(str::to_owned) + ); + assert_eq!( + evidence_classes, + ["fixture_backed", "live_baseline", "live_real_world", "research_gate"].map(str::to_owned) + ); + assert_eq!( + report.pointer("/scoreboard/summary_claim").and_then(Value::as_str), + Some("typed_non_pass_present") + ); + assert_eq!( + report.pointer("/scoreboard/job_summary_claim").and_then(Value::as_str), + Some("all_encoded_jobs_passed") + ); + assert_eq!( + report.pointer("/scoreboard/job_typed_non_pass_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report.pointer("/scoreboard/external_adapter_typed_non_pass_count").and_then(Value::as_u64), + Some(220) + ); + assert_eq!( + report.pointer("/scoreboard/typed_non_pass_count").and_then(Value::as_u64), + Some(220) + ); + assert_eq!( + string_array_at(&report, "/scoreboard/job_typed_non_pass_states_present")?, + Vec::::new() + ); + + for state in ["blocked", "incomplete", "not_encoded", "not_tested", "wrong_result"] { + assert!(array_contains_str(&report, "/scoreboard/typed_non_pass_states_present", state)?); + assert!(array_contains_str( + &report, + "/scoreboard/external_adapter_typed_non_pass_states_present", + state + )?); + } + + assert_eq!( + report.pointer("/scoreboard/unqualified_win_claim_allowed").and_then(Value::as_bool), + Some(false) + ); + assert_eq!( + report.pointer("/scoreboard/evidence_class_counts/live_baseline").and_then(Value::as_u64), + Some(6) + ); + + let suites = array_at(&report, "/suites")?; + let adversarial = find_by_field(suites, "/suite_id", "adversarial_quality")?; + + assert_eq!(adversarial.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(adversarial.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); + + Ok(()) +} + +#[test] +fn adversarial_quality_fixture_catches_unsupported_and_stale_regressions() -> Result<()> { + let temp_dir = + env::temp_dir().join(format!("elf-adversarial-quality-regression-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + assert_stale_regression_is_wrong_result(&temp_dir)?; + assert_unsupported_regression_is_unsupported_claim(&temp_dir)?; + + Ok(()) +} + +fn assert_stale_regression_is_wrong_result(temp_dir: &Path) -> Result<()> { + let stale_fixture = adversarial_quality_fixture_dir().join("stale_fact_current_answer.json"); + let mut stale = load_json(&stale_fixture)?; + + set_json_pointer( + &mut stale, + "/corpus/adapter_response/answer/content", + Value::String( + "Run cargo make check before review handoff because that is the current gate." + .to_string(), + ), + )?; + set_json_pointer( + &mut stale, + "/corpus/adapter_response/answer/evidence_ids", + serde_json::json!(["stale-ops-runbook-v1"]), + )?; + set_json_pointer( + &mut stale, + "/corpus/adapter_response/answer/claims", + serde_json::json!([ + { + "claim_id": "current_gate_sequence", + "text": "Run cargo make check before review handoff.", + "evidence_ids": ["stale-ops-runbook-v1"], + "confidence": "high" + } + ]), + )?; + + fs::write(temp_dir.join("stale_regression.json"), serde_json::to_vec_pretty(&stale)?)?; + + let stale_report = run_json_report_from(temp_dir.to_path_buf())?; + let stale_jobs = array_at(&stale_report, "/jobs")?; + let stale_job = + find_by_field(stale_jobs, "/job_id", "adversarial-quality-stale-fact-current-answer-001")?; + + assert_eq!(stale_job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(stale_job.pointer("/stale_answer_count").and_then(Value::as_u64), Some(1)); + assert_eq!( + stale_report.pointer("/scoreboard/summary_claim").and_then(Value::as_str), + Some("typed_non_pass_present") + ); + assert_eq!( + stale_report.pointer("/scoreboard/job_summary_claim").and_then(Value::as_str), + Some("typed_non_pass_present") + ); + assert_eq!( + stale_report.pointer("/scoreboard/job_typed_non_pass_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + stale_report.pointer("/scoreboard/typed_non_pass_count").and_then(Value::as_u64), + Some(221) + ); + assert!(array_contains_str( + &stale_report, + "/scoreboard/typed_non_pass_states_present", + "wrong_result" + )?); + assert!(array_contains_str( + &stale_report, + "/scoreboard/job_typed_non_pass_states_present", + "wrong_result" + )?); + + fs::remove_file(temp_dir.join("stale_regression.json"))?; + + Ok(()) +} + +fn assert_unsupported_regression_is_unsupported_claim(temp_dir: &Path) -> Result<()> { + let unsupported_fixture = + adversarial_quality_fixture_dir().join("unsupported_claim_refusal.json"); + let mut unsupported = load_json(&unsupported_fixture)?; + + set_json_pointer( + &mut unsupported, + "/corpus/adapter_response/answer/content", + Value::String( + "The fixture proves private-corpus production quality and broad competitor superiority." + .to_string(), + ), + )?; + set_json_pointer( + &mut unsupported, + "/corpus/adapter_response/answer/evidence_ids", + serde_json::json!(["unsupported-production-quality-trap"]), + )?; + set_json_pointer( + &mut unsupported, + "/corpus/adapter_response/answer/claims", + serde_json::json!([ + { + "claim_id": "production_quality_proven", + "text": "The fixture proves private-corpus production quality and broad competitor superiority.", + "evidence_ids": ["unsupported-production-quality-trap"], + "confidence": "high" + } + ]), + )?; + + fs::write( + temp_dir.join("unsupported_regression.json"), + serde_json::to_vec_pretty(&unsupported)?, + )?; + + let unsupported_report = run_json_report_from(temp_dir.to_path_buf())?; + let unsupported_jobs = array_at(&unsupported_report, "/jobs")?; + let unsupported_job = find_by_field( + unsupported_jobs, + "/job_id", + "adversarial-quality-unsupported-claim-refusal-001", + )?; + + assert_eq!( + unsupported_job.pointer("/status").and_then(Value::as_str), + Some("unsupported_claim") + ); + assert_eq!( + unsupported_report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), + Some(1) + ); + assert!(array_contains_str( + &unsupported_report, + "/scoreboard/typed_non_pass_states_present", + "unsupported_claim" + )?); + assert!(array_contains_str( + &unsupported_report, + "/scoreboard/job_typed_non_pass_states_present", + "unsupported_claim" + )?); + + Ok(()) +} + +#[test] +fn adversarial_quality_repeated_fixture_run_is_deterministic() -> Result<()> { + let first = run_json_report_from(adversarial_quality_fixture_dir())?; + let second = run_json_report_from(adversarial_quality_fixture_dir())?; + + assert_eq!(first.pointer("/scoreboard"), second.pointer("/scoreboard")); + assert_eq!(first.pointer("/summary"), second.pointer("/summary")); + assert_eq!(first.pointer("/suites"), second.pointer("/suites")); + assert_eq!(first.pointer("/jobs"), second.pointer("/jobs")); + + Ok(()) +} + #[test] fn external_adapter_run_summarizes_nonzero_scenario_losses() -> Result<()> { let manifest_path = Path::new(env!("CARGO_MANIFEST_DIR")) @@ -2644,7 +2907,7 @@ fn assert_live_sweep_record(adapter: &Value, production_ops_status: &str) -> Res fn runner_discovers_nested_fixture_layout() -> Result<()> { let report = run_json_report_from(fixture_root())?; - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(67)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(72)); Ok(()) } @@ -7403,7 +7666,7 @@ fn memory_authority_benchmark_covers_entity_history_and_core_archive_strengths() assert_eq!( report.pointer("/summary/history_readback_encoded_count").and_then(Value::as_u64), - Some(3) + Some(4) ); let suites = array_at(&report, "/suites")?; @@ -7555,10 +7818,10 @@ fn assert_root_knowledge_summary(report: &Value) { ); } -fn assert_root_aggregate_summary(report: &Value) { - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(67)); - assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(17)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(60)); +fn assert_root_aggregate_summary(report: &Value) -> Result<()> { + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(72)); + assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(18)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(65)); assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(7)); @@ -7577,11 +7840,11 @@ fn assert_root_aggregate_summary(report: &Value) { assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(0)); assert_eq!( report.pointer("/summary/conflict_detection_count").and_then(Value::as_u64), - Some(9) + Some(11) ); assert_eq!( report.pointer("/summary/update_rationale_available_count").and_then(Value::as_u64), - Some(13) + Some(16) ); assert_eq!( report.pointer("/summary/temporal_validity_not_encoded_count").and_then(Value::as_u64), @@ -7601,11 +7864,11 @@ fn assert_root_aggregate_summary(report: &Value) { ); assert_eq!( report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), - Some(152) + Some(162) ); assert_eq!( report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), - Some(152) + Some(162) ); assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0)); @@ -7618,6 +7881,9 @@ fn assert_root_aggregate_summary(report: &Value) { report.pointer("/summary/wrong_result_stage_attribution_count").and_then(Value::as_u64), Some(0) ); + + assert_root_scoreboard_summary(report)?; + assert_eq!( report.pointer("/summary/consolidation/proposal_count").and_then(Value::as_u64), Some(5) @@ -7648,6 +7914,54 @@ fn assert_root_aggregate_summary(report: &Value) { assert_root_knowledge_summary(report); assert_root_proactive_brief_summary(report); assert_root_scheduled_memory_summary(report); + + Ok(()) +} + +fn assert_root_scoreboard_summary(report: &Value) -> Result<()> { + assert_eq!( + report.pointer("/scoreboard/summary_claim").and_then(Value::as_str), + Some("typed_non_pass_present") + ); + assert_eq!( + report.pointer("/scoreboard/job_summary_claim").and_then(Value::as_str), + Some("typed_non_pass_present") + ); + assert_eq!( + report.pointer("/scoreboard/job_typed_non_pass_count").and_then(Value::as_u64), + Some(7) + ); + assert_eq!( + report.pointer("/scoreboard/external_adapter_typed_non_pass_count").and_then(Value::as_u64), + Some(220) + ); + assert_eq!( + report.pointer("/scoreboard/typed_non_pass_count").and_then(Value::as_u64), + Some(227) + ); + assert_eq!( + report.pointer("/scoreboard/unqualified_win_claim_allowed").and_then(Value::as_bool), + Some(false) + ); + + for state in ["blocked", "incomplete", "not_encoded", "not_tested", "wrong_result"] { + assert!(array_contains_str(report, "/scoreboard/typed_non_pass_states_present", state)?); + } + + assert_eq!( + string_array_at(report, "/scoreboard/job_typed_non_pass_states_present")?, + ["blocked"].map(str::to_owned) + ); + + for state in ["blocked", "incomplete", "not_encoded", "not_tested", "wrong_result"] { + assert!(array_contains_str( + report, + "/scoreboard/external_adapter_typed_non_pass_states_present", + state + )?); + } + + Ok(()) } fn assert_root_proactive_brief_summary(report: &Value) { @@ -7747,6 +8061,7 @@ fn assert_root_aggregate_suites(report: &Value) -> Result<()> { "knowledge_compilation", "operator_debugging_ux", "memory_evolution", + "adversarial_quality", "core_archival_memory", ] { let suite = find_by_field(suites, "/suite_id", suite_id)?; @@ -7775,6 +8090,11 @@ fn assert_root_aggregate_suites(report: &Value) -> Result<()> { assert_eq!(core_suite.pointer("/status").and_then(Value::as_str), Some("pass")); assert_eq!(core_suite.pointer("/encoded_job_count").and_then(Value::as_u64), Some(6)); + let adversarial = find_by_field(suites, "/suite_id", "adversarial_quality")?; + + assert_eq!(adversarial.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(adversarial.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); + let production_ops = find_by_field(suites, "/suite_id", "production_ops")?; assert_eq!(production_ops.pointer("/status").and_then(Value::as_str), Some("blocked")); @@ -7857,7 +8177,7 @@ fn assert_root_aggregate_jobs(report: &Value) -> Result<()> { fn real_world_memory_fixtures_report_aggregate_metrics() -> Result<()> { let report = run_json_report_from(real_world_memory_fixture_dir())?; - assert_root_aggregate_summary(&report); + assert_root_aggregate_summary(&report)?; assert_root_aggregate_suites(&report)?; assert_root_aggregate_jobs(&report)?; diff --git a/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md b/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md index 50ee9317..b93f03b3 100644 --- a/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md @@ -6,7 +6,7 @@ resource: docs/runbook/benchmarking/real_world_agent_memory_benchmark.md status: active authority: procedural owner: runbook -last_verified: 2026-06-18 +last_verified: 2026-06-23 tags: - docs - runbook @@ -74,6 +74,7 @@ compile knowledge, and state honest uncertainty. | Personalization | Scoped preferences without cross-tenant leakage. | Apply the user's current preference and ignore another project's note. | | Core/archival memory | Always-loaded core memory behavior kept separate from archival note search. | Detect a stale core block and fall back to archival evidence. | | Context trajectory | Staged context trajectory, hierarchy selection, rejected sibling/decoy handling, and recursive expansion. | Block OpenViking trajectory scoring until same-corpus evidence ids and comparable stage artifacts exist. | +| Adversarial quality | Quality-claim grammar under stale facts, unsupported claims, conflicting authority, private spans, and corrections. | Refuse a broad quality claim and preserve typed non-pass states instead of reporting a win. | ## External Reference Mapping @@ -106,13 +107,24 @@ A real-world benchmark report must preserve typed outcomes: - `pass` - `wrong_result` -- `lifecycle_fail` - `incomplete` - `blocked` +- `not_tested` - `not_encoded` - `unsupported_claim` -Do not collapse those terms into one leaderboard. `unsupported_claim` is especially +The public quality scoreboard also reports evidence classes: + +- `fixture_backed` +- `live_baseline` +- `live_real_world` +- `research_gate` + +Internal diagnostics may keep narrower terms such as `lifecycle_fail`, but the public +scoreboard must expose typed public non-pass states instead of hiding them behind a +single win/loss column. Do not collapse `wrong_result`, `incomplete`, `blocked`, +`not_tested`, `not_encoded`, `unsupported_claim`, `fixture_backed`, `live_baseline`, +or `research_gate` rows into one leaderboard. `unsupported_claim` is especially important: it means the system made a substantive claim that the corpus or evidence did not support. That is a different and higher-risk failure than simply missing a result. @@ -189,25 +201,51 @@ including the retrieval-quality slice below. The suite currently encodes: stage-level trace readback for the same-corpus gate, missing staged artifact, selected hierarchy/rejected sibling gate, and recursive expansion/pruned-branch gate so a blocker is reviewable instead of a prose-only limitation. +- `adversarial_quality`: stale-fact suppression, unsupported-claim refusal, + conflicting source authority selection, private/excluded span suppression, and + correction persistence. These fixtures gate the quality scoreboard grammar so + unsupported, stale, blocked, incomplete, wrong-result, and not-encoded behavior + cannot be counted as a win. - `p1_closeout` fixture slice: four jobs across the existing `consolidation`, `memory_evolution`, and `work_resume` suites for Source Library -> Memory Candidate -> approved memory -> recall/debug -> correction/rollback, stale decision suppression, unsupported-claim refusal, and work-resume next action. -The generated report includes evidence coverage, source-ref coverage, quote coverage, -unsupported-claim count, stale retrieval count, stale-answer count, conflict detection -count, update rationale availability, temporal validity encoding count, scope -correctness, redaction leak count, capture/integration behavior classes, Qdrant -rebuild case/pass counts, expected evidence recall, irrelevant context ratio, -latency/cost, answer-type plus caveat/refusal/uncertainty flags, and trace -explainability counters, production-ops blocked/wrong-result job states, and +The generated report includes the public quality scoreboard +`elf.quality_scoreboard/v1`, encoded-job and external-adapter typed non-pass +counts/states, aggregate typed non-pass counts/states, evidence-class counts, bounded +job and aggregate summary claims, the unqualified-win guard, evidence coverage, +source-ref coverage, quote coverage, unsupported-claim count, stale retrieval count, +stale-answer count, conflict detection count, update rationale availability, temporal +validity encoding count, scope correctness, redaction leak count, capture/integration +behavior classes, Qdrant rebuild case/pass counts, expected evidence recall, +irrelevant context ratio, latency/cost, answer-type plus +caveat/refusal/uncertainty flags, trace explainability counters, production-ops +blocked/wrong-result job states, and private-corpus redaction policy. The fixtures include negative traps for stale blockers, unsupported prior claims, stale deleted facts, stale historical facts, cross-project preference leakage, private/redacted text leakage, obsolete retrieval context, project-decision stale reuse, missing rationale, uncited current policy claims, overconfident unsupported decision answers, distractor context, -index-only restore claims, private-corpus pass claims without a manifest, and -checked-in credential leakage. +index-only restore claims, private-corpus pass claims without a manifest, checked-in +credential leakage, and adversarial stale or unsupported scoreboard claims. + +Current checked-in adversarial quality increment: + +```sh +cargo make real-world-memory-adversarial-quality +``` + +This parses +`apps/elf-eval/fixtures/real_world_memory/adversarial_quality/`, writes +`tmp/real-world-memory/adversarial-quality/report.json`, and renders +`tmp/real-world-memory/adversarial-quality/report.md`. + +The slice scores five fixture-backed jobs for stale fact suppression, +unsupported-claim refusal, conflicting source authority, private/excluded spans, and +correction persistence. The report is deliberately narrow: it proves that the +scoreboard grammar and adversarial traps catch stale or unsupported behavior, not that +ELF has a live-adapter, private-corpus, provider-backed, or broad competitor win. Current checked-in P1 closeout increment: @@ -320,9 +358,17 @@ research gates. Its `external_adapters` report section distinguishes: - `research_gate`: checked-in source/setup/runtime/resource/retry metadata for a future adapter path, not fixture-backed or live execution evidence. -Current fixture state: `cargo make real-world-memory-json` covers 66 jobs across 17 -suites, with 59 pass and 7 blocked. The P1 closeout fixture slice contributes four -passing jobs for memory-authority closeout evidence. The `core_archival_memory` suite +The public quality scoreboard renders the existing manifest evidence bucket +`live_baseline_only` as the public evidence class `live_baseline`. When the default +external adapter manifest is loaded, the scoreboard's typed non-pass count includes +adapter coverage and scenario rows as well as fixture jobs. + +Current fixture state: `cargo make real-world-memory-json` covers 72 jobs across 18 +suites, with 65 pass and 7 blocked. The adversarial quality slice contributes five +passing fixture-backed jobs that exercise stale fact suppression, unsupported-claim +refusal, source-authority conflicts, private-span exclusion, and correction +persistence. The P1 closeout fixture slice contributes four passing jobs for +memory-authority closeout evidence. The `core_archival_memory` suite contributes six passing fixture jobs for core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery. The `memory_summary` suite diff --git a/docs/spec/real_world_agent_memory_benchmark_v1.md b/docs/spec/real_world_agent_memory_benchmark_v1.md index 12b5213f..01360c73 100644 --- a/docs/spec/real_world_agent_memory_benchmark_v1.md +++ b/docs/spec/real_world_agent_memory_benchmark_v1.md @@ -64,6 +64,43 @@ evidence, traps, and scoring rubric first-class. A system can pass retrieval and fail a real-world job if it repeats completed work, cites obsolete evidence, omits a blocking caveat, or fabricates a decision that is not in the corpus. +## Quality Scoreboard Grammar + +The public quality scoreboard is a claim grammar, not a leaderboard. Reports MUST use +the grammar below when summarizing what is proven, what is not proven, and which +evidence class supports the claim. + +Public result states: + +| State | Meaning | +| --- | --- | +| `pass` | The encoded job or suite ran to completion, met its threshold, satisfied required evidence, and hit no hard-fail rule. | +| `wrong_result` | The runner reached the behavioral check but selected the wrong answer, wrong action, stale/current fact, or missed required evidence. | +| `incomplete` | The runner or adapter did not reach the behavioral check because setup, wiring, parse, build, or runtime execution failed. | +| `blocked` | The check cannot be run safely without credentials, manual setup, private input, durable product runtime, or host integration outside the run scope. | +| `not_tested` | No benchmark execution or comparable adapter output exists for the row. | +| `not_encoded` | The suite, job, adapter path, or scoring dimension is not implemented in the runner, so no pass/fail claim is allowed. | +| `unsupported_claim` | The system or report made a substantive claim, decision, evidence citation, or capability claim that is not supported by the corpus, required evidence, or report metadata. | + +Public evidence classes: + +| Evidence class | Meaning | +| --- | --- | +| `fixture_backed` | Checked-in fixture evidence was scored. This is useful regression evidence, not live product execution. | +| `live_baseline` | Docker live-baseline retrieval or lifecycle evidence exists. It is not a real-world suite pass by itself. | +| `live_real_world` | A live adapter executed the real-world job contract and emitted typed outcomes. | +| `research_gate` | Research, setup, source mapping, credential, or resource gates are recorded before a fair benchmark can run. | + +Report implementations MAY keep narrower internal diagnostic statuses such as +`lifecycle_fail`, but public scoreboards MUST treat every non-`pass` diagnostic as a +typed non-pass state. A report MUST NOT collapse `wrong_result`, `incomplete`, +`blocked`, `not_tested`, `not_encoded`, `unsupported_claim`, `fixture_backed`, +`live_baseline`, or `research_gate` rows into wins, parity, or proof of broad product +quality. If any typed non-pass job or external-adapter row is present, the aggregate +summary claim MUST remain a bounded statement such as `typed_non_pass_present`, not an +unqualified win. Reports MAY also expose a separate encoded-job-only summary claim, but +that narrower claim MUST NOT override the aggregate claim boundary. + ## Real-World Job Schema A `real_world_job` record MUST include the fields below. JSON is the canonical exchange @@ -586,6 +623,7 @@ Suite ids are stable public names. Each suite MUST contain at least one | `project_decisions` | Recover durable decisions, rationale, reversals, and current policy. | Explain why a design was chosen; distinguish old vs current validation gate; cite decision evidence. | Decision records, superseding events, accepted alternatives, current-policy timestamp. | answer_correctness, evidence_grounding, trap_avoidance, uncertainty_handling. | ELF, gbrain, llm-wiki, Letta. | | `retrieval` | Measure task-relevant retrieval quality beyond top-k keyword matching. | Answer a task query with expected evidence; find alternate phrasing; avoid near-duplicate project evidence. | Expected evidence ids, allowed alternates, decoy evidence ids, trace ids when available. | answer_correctness, evidence_grounding, trap_avoidance, latency_resource. | qmd, ELF, memsearch, OpenViking. | | `memory_evolution` | Verify updates, deletes, expiry, supersession, contradiction handling, and history. | Apply a new preference; suppress a deleted memory; explain what superseded an old fact. | Before/after memory versions, ingest decision rows or adapter history, current timeline event. | lifecycle_behavior, answer_correctness, evidence_grounding, trap_avoidance. | mem0, ELF, Graphiti/Zep, Letta. | +| `adversarial_quality` | Verify quality-claim grammar under adversarial memory failures. | Suppress stale facts; refuse unsupported claims; choose authoritative current sources; exclude private spans; prove correction persistence. | Current and historical evidence ids, unsupported-claim traps, authority-ordering evidence, write-policy audit, correction and rollback readback. | answer_correctness, evidence_grounding, trap_avoidance, uncertainty_handling, lifecycle_behavior. | ELF, qmd, mem0/OpenMemory, Letta. | | `consolidation` | Test reviewable derived memory formation without hidden source mutation. | Produce a consolidation proposal; identify unsupported claims; discard stale synthesis. | Source inputs, derived proposal id, lineage, review state, conflict markers. | answer_correctness, evidence_grounding, uncertainty_handling, debuggability. | Claude Dreams, Gemini CLI Auto Memory, Always-On Memory Agent, ELF. | | `memory_summary` | Test reviewable top-of-mind, background, stale, superseded, tombstoned, and derived project-profile memory readback. | Produce a current memory summary; downgrade stale memory; expose a TTL tombstone; refuse an unsupported derived profile claim. | Summary entry source refs, freshness and validity markers, source trace, inclusion/downgrade/exclusion rationale, unsupported-claim flags. | answer_correctness, evidence_grounding, lifecycle_behavior, trap_avoidance, uncertainty_handling. | OpenAI Dreaming, Claude Dreams, Always-On Memory Agent, ELF. | | `knowledge_compilation` | Compile evidence into maintained project/entity/concept pages while preserving provenance. | Build a project status page; answer from compiled truth plus timeline; lint a stale page section. | Page section sources, backlinks, timeline entries, lint evidence. | answer_correctness, evidence_grounding, workflow_helpfulness, trap_avoidance. | llm-wiki, gbrain, graphify, OpenKB, ELF. | @@ -612,6 +650,7 @@ Outcome terms: | `lifecycle_fail` | The answer surface may be correct for retrieval, but encoded update, delete, expiry, cold-start, persistence, history, or supersession behavior failed. | | `incomplete` | The runner could not reach the behavioral check because install, build, dependency, adapter wiring, parse, or runtime setup failed. | | `blocked` | The check cannot be run safely without credentials, manual setup, private corpus input, durable runtime integration, or host integration outside the run scope. | +| `not_tested` | No benchmark execution or comparable adapter output exists for the row. | | `not_encoded` | The suite, job, adapter path, or scoring dimension is not implemented in the runner, so no pass/fail claim is allowed. | | `unsupported_claim` | The system produced a substantive claim, decision, evidence citation, or capability claim that is not supported by the job corpus, required evidence, or report metadata. | @@ -634,6 +673,11 @@ Suite status rules: Reports MUST include: +- quality scoreboard grammar using schema `elf.quality_scoreboard/v1`, including public + result states, evidence classes, encoded-job and external-adapter typed non-pass + counts, visible typed non-pass states for each bucket and the aggregate report, + evidence-class counts, bounded job and aggregate summary claims, and an explicit + unqualified-win guard; - run id, runner version, corpus profile, job ids, suite ids, project adapter metadata; - per-job status, normalized score, hard-fail hits, evidence ids used, trap ids used; - per-job `answer_type`, required caveat/refusal flags, and whether an unknown answer