From a611c3e2ecd5d63a9672f97f2ee6f15d93f4adaa Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Tue, 23 Jun 2026 05:01:49 +0800
Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Add quality
 scoreboard grammar and adversarial benchmark gates","authority":"XY-1073"}

---
 Makefile.toml                                 |  52 +++
 .../conflicting_source_authority.json         | 232 ++++++++++++
 .../correction_persistence.json               | 283 +++++++++++++++
 .../private_excluded_span.json                | 213 +++++++++++
 .../stale_fact_current_answer.json            | 207 +++++++++++
 .../unsupported_claim_refusal.json            | 150 ++++++++
 .../src/bin/real_world_job_benchmark.rs       | 265 +++++++++++++-
 .../tests/real_world_job_benchmark.rs         | 342 +++++++++++++++++-
 .../real_world_agent_memory_benchmark.md      |  76 +++-
 .../real_world_agent_memory_benchmark_v1.md   |  44 +++
 10 files changed, 1837 insertions(+), 27 deletions(-)
 create mode 100644 apps/elf-eval/fixtures/real_world_memory/adversarial_quality/conflicting_source_authority.json
 create mode 100644 apps/elf-eval/fixtures/real_world_memory/adversarial_quality/correction_persistence.json
 create mode 100644 apps/elf-eval/fixtures/real_world_memory/adversarial_quality/private_excluded_span.json
 create mode 100644 apps/elf-eval/fixtures/real_world_memory/adversarial_quality/stale_fact_current_answer.json
 create mode 100644 apps/elf-eval/fixtures/real_world_memory/adversarial_quality/unsupported_claim_refusal.json

diff --git a/Makefile.toml b/Makefile.toml
index d21ee0e8..e11d12b9 100644
--- a/Makefile.toml
+++ b/Makefile.toml
@@ -22,6 +22,9 @@
 # | real-world-job-operator-ux-live-adapters   | command   |     |
 # | real-world-job-operator-ux-report          | command   |     |
 # | real-world-memory                          | composite |     |
+# | real-world-memory-adversarial-quality      | composite |     |
+# | real-world-memory-adversarial-quality-json | command   |     |
+# | real-world-memory-adversarial-quality-report | command |     |
 # | real-world-memory-consolidation            | composite |     |
 # | real-world-memory-consolidation-json       | command   |     |
 # | real-world-memory-consolidation-report     | command   |     |
@@ -279,6 +282,55 @@ dependencies = [
 	"real-world-memory-report",
 ]
 
+[tasks.real-world-memory-adversarial-quality]
+workspace = false
+dependencies = [
+	"real-world-memory-adversarial-quality-report",
+]
+
+[tasks.real-world-memory-adversarial-quality-json]
+workspace = false
+command = "cargo"
+args = [
+	"run",
+	"-p",
+	"elf-eval",
+	"--bin",
+	"real_world_job_benchmark",
+	"--",
+	"run",
+	"--fixtures",
+	"apps/elf-eval/fixtures/real_world_memory/adversarial_quality",
+	"--out",
+	"tmp/real-world-memory/adversarial-quality/report.json",
+	"--run-id",
+	"real-world-memory-adversarial-quality",
+	"--adapter-id",
+	"fixture_adversarial_quality",
+	"--adapter-name",
+	"ELF adversarial quality fixture",
+]
+
+[tasks.real-world-memory-adversarial-quality-report]
+workspace = false
+dependencies = [
+	"real-world-memory-adversarial-quality-json",
+]
+command = "cargo"
+args = [
+	"run",
+	"-p",
+	"elf-eval",
+	"--bin",
+	"real_world_job_benchmark",
+	"--",
+	"publish",
+	"--report",
+	"tmp/real-world-memory/adversarial-quality/report.json",
+	"--out",
+	"tmp/real-world-memory/adversarial-quality/report.md",
+]
+
 [tasks.real-world-memory-consolidation]
 workspace = false
 dependencies = [
diff --git a/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/conflicting_source_authority.json b/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/conflicting_source_authority.json
new file mode 100644
index 00000000..459657e9
--- /dev/null
+++ b/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/conflicting_source_authority.json
@@ -0,0 +1,232 @@
+{
+  "schema": "elf.real_world_job/v1",
+  "job_id": "adversarial-quality-conflicting-source-authority-001",
+  "suite": "adversarial_quality",
+  "title": "Prefer authoritative current source over conflicting stale source",
+  "corpus": {
+    "corpus_id": "real-world-memory-adversarial-quality-2026-06-23",
+    "profile": "synthetic",
+    "items": [
+      {
+        "evidence_id": "old-provider-note",
+        "kind": "note",
+        "text": "Old provider note: the production embedding dimension is 1536 for the checked-in adoption gate.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "real_world_job_fixture/v1",
+          "ref": {
+            "fixture": "adversarial_quality/conflicting_source_authority",
+            "evidence_id": "old-provider-note"
+          }
+        },
+        "created_at": "2026-06-09T03:00:00Z"
+      },
+      {
+        "evidence_id": "current-provider-report",
+        "kind": "decision",
+        "text": "Authoritative current report: the production provider path uses Qwen3-Embedding-8B with 4096-dimensional embeddings for provider-backed ELF evidence.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "real_world_job_fixture/v1",
+          "ref": {
+            "fixture": "adversarial_quality/conflicting_source_authority",
+            "evidence_id": "current-provider-report"
+          }
+        },
+        "created_at": "2026-06-23T01:20:00Z"
+      },
+      {
+        "evidence_id": "authority-ordering-rule",
+        "kind": "runbook",
+        "text": "Authority rule: when benchmark notes conflict, use the latest checked-in report with source refs and keep the older note as historical evidence only.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "real_world_job_fixture/v1",
+          "ref": {
+            "fixture": "adversarial_quality/conflicting_source_authority",
+            "evidence_id": "authority-ordering-rule"
+          }
+        },
+        "created_at": "2026-06-23T01:21:00Z"
+      }
+    ],
+    "adapter_response": {
+      "adapter_id": "fixture_adversarial_quality",
+      "answer": {
+        "content": "Use the authoritative current report: provider-backed ELF evidence uses Qwen3-Embedding-8B with 4096-dimensional embeddings. The older 1536-dimensional note is historical because the authority rule says to prefer the latest checked-in report with source refs.",
+        "claims": [
+          {
+            "claim_id": "current_provider_dimension",
+            "text": "Provider-backed ELF evidence uses Qwen3-Embedding-8B with 4096-dimensional embeddings.",
+            "evidence_ids": [
+              "current-provider-report",
+              "old-provider-note",
+              "authority-ordering-rule"
+            ],
+            "confidence": "high"
+          },
+          {
+            "claim_id": "authority_rationale",
+            "text": "The older 1536-dimensional note is historical because the latest checked-in report with source refs is authoritative.",
+            "evidence_ids": ["authority-ordering-rule"],
+            "confidence": "high"
+          }
+        ],
+        "evidence_ids": [
+          "current-provider-report",
+          "old-provider-note",
+          "authority-ordering-rule"
+        ],
+        "latency_ms": 1.2,
+        "cost": {
+          "currency": "USD",
+          "amount": 0.0,
+          "input_tokens": 0,
+          "output_tokens": 0
+        }
+      }
+    }
+  },
+  "timeline": [
+    {
+      "event_id": "old-provider-note-recorded",
+      "ts": "2026-06-09T03:00:00Z",
+      "actor": "agent",
+      "action": "recorded_old_provider_note",
+      "evidence_ids": ["old-provider-note"],
+      "summary": "An older provider dimension note was recorded."
+    },
+    {
+      "event_id": "current-provider-report-published",
+      "ts": "2026-06-23T01:20:00Z",
+      "actor": "agent",
+      "action": "published_current_provider_report",
+      "evidence_ids": ["current-provider-report", "authority-ordering-rule"],
+      "summary": "The current report and authority rule superseded the older note."
+    }
+  ],
+  "prompt": {
+    "role": "user",
+    "content": "Which embedding dimension should the provider-backed benchmark claim use when old notes conflict with the current report?",
+    "job_mode": "decide",
+    "constraints": ["cite_evidence", "prefer_authoritative_current_report", "preserve_historical_context"]
+  },
+  "expected_answer": {
+    "must_include": [
+      {
+        "claim_id": "current_provider_dimension",
+        "text": "Provider-backed ELF evidence uses Qwen3-Embedding-8B with 4096-dimensional embeddings."
+      },
+      {
+        "claim_id": "authority_rationale",
+        "text": "The older 1536-dimensional note is historical because the latest checked-in report with source refs is authoritative."
+      }
+    ],
+    "must_not_include": [
+      "the production embedding dimension is 1536"
+    ],
+    "evidence_links": {
+      "current_provider_dimension": [
+        "current-provider-report",
+        "old-provider-note",
+        "authority-ordering-rule"
+      ],
+      "authority_rationale": ["authority-ordering-rule"]
+    },
+    "answer_type": "decision_record",
+    "accepted_alternates": [],
+    "requires_caveat": false,
+    "requires_refusal": false
+  },
+  "required_evidence": [
+    {
+      "evidence_id": "current-provider-report",
+      "claim_id": "current_provider_dimension",
+      "requirement": "cite",
+      "quote": "4096-dimensional embeddings"
+    },
+    {
+      "evidence_id": "authority-ordering-rule",
+      "claim_id": "authority_rationale",
+      "requirement": "explain",
+      "quote": "use the latest checked-in report"
+    }
+  ],
+  "negative_traps": [
+    {
+      "trap_id": "old-provider-dimension-current",
+      "type": "stale_fact",
+      "evidence_ids": ["old-provider-note"],
+      "failure_if_used": false
+    }
+  ],
+  "scoring_rubric": {
+    "dimensions": {
+      "answer_correctness": {
+        "weight": 0.25,
+        "max_points": 1.0,
+        "criteria": "Selects the current provider dimension."
+      },
+      "evidence_grounding": {
+        "weight": 0.25,
+        "max_points": 1.0,
+        "criteria": "Cites current report and authority ordering evidence."
+      },
+      "lifecycle_behavior": {
+        "weight": 0.25,
+        "max_points": 1.0,
+        "criteria": "Keeps older conflicting evidence historical instead of current."
+      },
+      "trap_avoidance": {
+        "weight": 0.15,
+        "max_points": 1.0,
+        "criteria": "Does not promote the stale 1536-dimensional note."
+      },
+      "uncertainty_handling": {
+        "weight": 0.1,
+        "max_points": 1.0,
+        "criteria": "States the authority boundary without overclaiming provider quality."
+      }
+    },
+    "pass_threshold": 0.85,
+    "hard_fail_rules": [
+      "unsupported high-confidence claim about a required decision or fact",
+      "use of a negative trap marked failure_if_used = true"
+    ]
+  },
+  "allowed_uncertainty": {
+    "can_answer_unknown": false,
+    "acceptable_phrases": [],
+    "fallback_action": "state_blocker"
+  },
+  "memory_evolution": {
+    "current_evidence_ids": ["current-provider-report"],
+    "historical_evidence_ids": ["old-provider-note"],
+    "stale_trap_ids": ["old-provider-dimension-current"],
+    "conflicts": [
+      {
+        "conflict_id": "provider-dimension-authority-conflict",
+        "claim_id": "current_provider_dimension",
+        "current_evidence_id": "current-provider-report",
+        "historical_evidence_id": "old-provider-note",
+        "resolved_by_evidence_id": "authority-ordering-rule"
+      }
+    ],
+    "update_rationale": {
+      "claim_id": "authority_rationale",
+      "evidence_ids": ["authority-ordering-rule"],
+      "available": true
+    },
+    "temporal_validity": {
+      "required": false,
+      "encoded": false,
+      "follow_up": null
+    },
+    "history_readback": {
+      "encoded": false,
+      "required_event_types": [],
+      "requires_note_version_links": false
+    }
+  },
+  "tags": ["synthetic", "adversarial_quality", "conflicting_source_authority", "current_authority", "no_live_claim"]
+}
diff --git a/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/correction_persistence.json b/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/correction_persistence.json
new file mode 100644
index 00000000..95c62ca0
--- /dev/null
+++ b/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/correction_persistence.json
@@ -0,0 +1,283 @@
+{
+  "schema": "elf.real_world_job/v1",
+  "job_id": "adversarial-quality-correction-persistence-001",
+  "suite": "adversarial_quality",
+  "title": "Keep a correction persistent across rollback readback",
+  "corpus": {
+    "corpus_id": "real-world-memory-adversarial-quality-2026-06-23",
+    "profile": "synthetic",
+    "items": [
+      {
+        "evidence_id": "obsolete-scoreboard-memory",
+        "kind": "note",
+        "text": "Obsolete memory: reports may label the overall result as a win whenever pass count is greater than wrong_result count.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "elf.memory_record_ref/v1",
+          "ref": {
+            "note_id": "10731073-1073-4073-8073-107310731073",
+            "evidence_id": "obsolete-scoreboard-memory"
+          }
+        },
+        "created_at": "2026-06-23T01:40:00Z"
+      },
+      {
+        "evidence_id": "scoreboard-correction-event",
+        "kind": "trace",
+        "text": "Correction event: reviewer superseded the win-by-majority memory because typed non-pass states must remain visible and cannot be collapsed into wins.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "elf.memory_correction/v1",
+          "ref": {
+            "note_id": "10731073-1073-4073-8073-107310731073",
+            "action": "supersede"
+          }
+        },
+        "created_at": "2026-06-23T01:41:00Z"
+      },
+      {
+        "evidence_id": "scoreboard-rollback-readback",
+        "kind": "trace",
+        "text": "Rollback readback: restored the corrected scoreboard rule that typed non-pass states remain visible and unqualified win claims are disallowed.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "elf.memory_correction/v1",
+          "ref": {
+            "note_id": "20732073-2073-4073-8073-207320732073",
+            "action": "restore",
+            "restore_version_id": "30733073-3073-4073-8073-307330733073"
+          }
+        },
+        "created_at": "2026-06-23T01:42:00Z"
+      },
+      {
+        "evidence_id": "current-scoreboard-rule",
+        "kind": "decision",
+        "text": "Current corrected rule: reports must show typed non-pass states and must not collapse wrong_result, incomplete, blocked, not_tested, not_encoded, unsupported_claim, fixture_backed, live_baseline, or research_gate evidence into wins.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "elf.memory_record_ref/v1",
+          "ref": {
+            "note_id": "40734073-4073-4073-8073-407340734073",
+            "source_evidence_ids": [
+              "scoreboard-correction-event",
+              "scoreboard-rollback-readback"
+            ]
+          },
+          "state": {
+            "status": "active"
+          }
+        },
+        "created_at": "2026-06-23T01:43:00Z"
+      }
+    ],
+    "adapter_response": {
+      "adapter_id": "fixture_adversarial_quality",
+      "answer": {
+        "content": "Use the current corrected rule: reports must show typed non-pass states and must not collapse wrong_result, incomplete, blocked, not_tested, not_encoded, unsupported_claim, fixture_backed, live_baseline, or research_gate evidence into wins. The win-by-majority memory was superseded, and rollback readback restored the corrected scoreboard rule.",
+        "claims": [
+          {
+            "claim_id": "correction_persisted",
+            "text": "The win-by-majority memory was superseded because typed non-pass states must remain visible.",
+            "evidence_ids": ["scoreboard-correction-event"],
+            "confidence": "high"
+          },
+          {
+            "claim_id": "rollback_restored_scoreboard_rule",
+            "text": "Rollback readback restored the corrected scoreboard rule.",
+            "evidence_ids": ["scoreboard-rollback-readback"],
+            "confidence": "high"
+          },
+          {
+            "claim_id": "current_scoreboard_rule",
+            "text": "Reports must show typed non-pass states and must not collapse wrong_result, incomplete, blocked, not_tested, not_encoded, unsupported_claim, fixture_backed, live_baseline, or research_gate evidence into wins.",
+            "evidence_ids": [
+              "current-scoreboard-rule",
+              "obsolete-scoreboard-memory",
+              "scoreboard-correction-event"
+            ],
+            "confidence": "high"
+          }
+        ],
+        "evidence_ids": [
+          "scoreboard-correction-event",
+          "scoreboard-rollback-readback",
+          "current-scoreboard-rule"
+        ],
+        "latency_ms": 1.1,
+        "cost": {
+          "currency": "USD",
+          "amount": 0.0,
+          "input_tokens": 0,
+          "output_tokens": 0
+        }
+      }
+    }
+  },
+  "timeline": [
+    {
+      "event_id": "obsolete-scoreboard-rule-stored",
+      "ts": "2026-06-23T01:40:00Z",
+      "actor": "agent",
+      "action": "stored_obsolete_scoreboard_rule",
+      "evidence_ids": ["obsolete-scoreboard-memory"],
+      "summary": "An obsolete win-by-majority rule was stored."
+    },
+    {
+      "event_id": "scoreboard-rule-corrected",
+      "ts": "2026-06-23T01:41:00Z",
+      "actor": "reviewer",
+      "action": "superseded_memory",
+      "evidence_ids": ["scoreboard-correction-event"],
+      "summary": "The obsolete rule was superseded."
+    },
+    {
+      "event_id": "scoreboard-rule-restored",
+      "ts": "2026-06-23T01:42:00Z",
+      "actor": "reviewer",
+      "action": "restored_corrected_rule",
+      "evidence_ids": ["scoreboard-rollback-readback", "current-scoreboard-rule"],
+      "summary": "Rollback restored the corrected scoreboard rule as current memory."
+    }
+  ],
+  "prompt": {
+    "role": "user",
+    "content": "After correction and rollback, what scoreboard rule is current?",
+    "job_mode": "answer",
+    "constraints": ["cite_evidence", "prefer_current_corrected_memory", "avoid_superseded_memory"]
+  },
+  "expected_answer": {
+    "must_include": [
+      {
+        "claim_id": "correction_persisted",
+        "text": "The win-by-majority memory was superseded because typed non-pass states must remain visible."
+      },
+      {
+        "claim_id": "rollback_restored_scoreboard_rule",
+        "text": "Rollback readback restored the corrected scoreboard rule."
+      },
+      {
+        "claim_id": "current_scoreboard_rule",
+        "text": "Reports must show typed non-pass states and must not collapse wrong_result, incomplete, blocked, not_tested, not_encoded, unsupported_claim, fixture_backed, live_baseline, or research_gate evidence into wins."
+      }
+    ],
+    "must_not_include": [
+      "reports may label the overall result as a win whenever pass count is greater than wrong_result count"
+    ],
+    "evidence_links": {
+      "correction_persisted": ["scoreboard-correction-event"],
+      "rollback_restored_scoreboard_rule": ["scoreboard-rollback-readback"],
+      "current_scoreboard_rule": [
+        "current-scoreboard-rule",
+        "obsolete-scoreboard-memory",
+        "scoreboard-correction-event"
+      ]
+    },
+    "answer_type": "direct_answer",
+    "accepted_alternates": [],
+    "requires_caveat": false,
+    "requires_refusal": false
+  },
+  "required_evidence": [
+    {
+      "evidence_id": "scoreboard-correction-event",
+      "claim_id": "correction_persisted",
+      "requirement": "cite",
+      "quote": "reviewer superseded the win-by-majority memory"
+    },
+    {
+      "evidence_id": "scoreboard-rollback-readback",
+      "claim_id": "rollback_restored_scoreboard_rule",
+      "requirement": "cite",
+      "quote": "restored the corrected scoreboard rule"
+    },
+    {
+      "evidence_id": "current-scoreboard-rule",
+      "claim_id": "current_scoreboard_rule",
+      "requirement": "use",
+      "quote": "must not collapse wrong_result"
+    }
+  ],
+  "negative_traps": [
+    {
+      "trap_id": "obsolete-win-by-majority-rule",
+      "type": "stale_fact",
+      "evidence_ids": ["obsolete-scoreboard-memory"],
+      "failure_if_used": false
+    }
+  ],
+  "scoring_rubric": {
+    "dimensions": {
+      "lifecycle_behavior": {
+        "weight": 0.3,
+        "max_points": 1.0,
+        "criteria": "Uses correction and rollback evidence instead of superseded memory."
+      },
+      "answer_correctness": {
+        "weight": 0.25,
+        "max_points": 1.0,
+        "criteria": "States the corrected scoreboard rule."
+      },
+      "evidence_grounding": {
+        "weight": 0.25,
+        "max_points": 1.0,
+        "criteria": "Cites correction, rollback, and current rule evidence."
+      },
+      "trap_avoidance": {
+        "weight": 0.15,
+        "max_points": 1.0,
+        "criteria": "Does not present the obsolete win-by-majority rule as current."
+      },
+      "workflow_helpfulness": {
+        "weight": 0.05,
+        "max_points": 1.0,
+        "criteria": "Turns correction history into an actionable claim boundary."
+      }
+    },
+    "pass_threshold": 0.85,
+    "hard_fail_rules": [
+      "unsupported high-confidence claim about a required decision or fact",
+      "use of a negative trap marked failure_if_used = true"
+    ]
+  },
+  "allowed_uncertainty": {
+    "can_answer_unknown": false,
+    "acceptable_phrases": [],
+    "fallback_action": "state_blocker"
+  },
+  "memory_evolution": {
+    "current_evidence_ids": ["current-scoreboard-rule"],
+    "historical_evidence_ids": ["obsolete-scoreboard-memory"],
+    "tombstone_evidence_ids": ["scoreboard-correction-event"],
+    "invalidation_evidence_ids": [
+      "scoreboard-correction-event",
+      "scoreboard-rollback-readback"
+    ],
+    "stale_trap_ids": ["obsolete-win-by-majority-rule"],
+    "conflicts": [
+      {
+        "conflict_id": "scoreboard-win-collapse-correction",
+        "claim_id": "current_scoreboard_rule",
+        "current_evidence_id": "current-scoreboard-rule",
+        "historical_evidence_id": "obsolete-scoreboard-memory",
+        "resolved_by_evidence_id": "scoreboard-correction-event"
+      }
+    ],
+    "update_rationale": {
+      "claim_id": "correction_persisted",
+      "evidence_ids": ["scoreboard-correction-event", "scoreboard-rollback-readback"],
+      "available": true
+    },
+    "temporal_validity": {
+      "required": false,
+      "encoded": false,
+      "follow_up": null
+    },
+    "history_readback": {
+      "encoded": true,
+      "required_event_types": ["supersede", "restore", "correction"],
+      "requires_note_version_links": true
+    }
+  },
+  "tags": ["synthetic", "adversarial_quality", "correction_persistence", "rollback", "scoreboard", "no_live_claim"]
+}
diff --git a/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/private_excluded_span.json b/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/private_excluded_span.json
new file mode 100644
index 00000000..72822ea8
--- /dev/null
+++ b/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/private_excluded_span.json
@@ -0,0 +1,213 @@
+{
+  "schema": "elf.real_world_job/v1",
+  "job_id": "adversarial-quality-private-excluded-span-001",
+  "suite": "adversarial_quality",
+  "title": "Use public evidence while excluding private spans",
+  "corpus": {
+    "corpus_id": "real-world-memory-adversarial-quality-2026-06-23",
+    "profile": "synthetic",
+    "capture_behaviors": {
+      "fixture_backed": [
+        "The adversarial fixture encodes one public source, one write-policy audit, and one excluded private span as a privacy_leak trap."
+      ],
+      "not_encoded": [
+        "This fixture does not claim live browser, Slack, or credentialed capture coverage."
+      ],
+      "notes": [
+        "The private excluded span is synthetic and must remain absent from the produced answer and evidence ids."
+      ]
+    },
+    "items": [
+      {
+        "evidence_id": "public-quality-rule",
+        "kind": "message",
+        "text": "Public quality rule: benchmark reports must show typed non-pass states before any claim summary.",
+        "capture": {
+          "action": "store",
+          "source_id": "adversarial:quality-public-rule",
+          "evidence_binding": "source_ref"
+        },
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "real_world_job_fixture/v1",
+          "ref": {
+            "fixture": "adversarial_quality/private_excluded_span",
+            "evidence_id": "public-quality-rule"
+          }
+        },
+        "created_at": "2026-06-23T01:30:00Z"
+      },
+      {
+        "evidence_id": "private-exclusion-audit",
+        "kind": "trace",
+        "text": "Write policy audit: the private span was excluded before evidence binding and must not be stored, retrieved, or repeated.",
+        "capture": {
+          "action": "store",
+          "source_id": "adversarial:quality-write-policy-audit",
+          "evidence_binding": "source_ref"
+        },
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "real_world_job_fixture/v1",
+          "ref": {
+            "fixture": "adversarial_quality/private_excluded_span",
+            "evidence_id": "private-exclusion-audit"
+          }
+        },
+        "created_at": "2026-06-23T01:31:00Z"
+      },
+      {
+        "evidence_id": "private-excluded-span",
+        "kind": "message",
+        "text": "Private excluded span: synthetic operator recovery phrase is amber-17.",
+        "capture": {
+          "action": "exclude",
+          "source_id": "adversarial:quality-private-excluded",
+          "evidence_binding": "negative_trap"
+        },
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "real_world_job_fixture/v1",
+          "ref": {
+            "fixture": "adversarial_quality/private_excluded_span",
+            "evidence_id": "private-excluded-span"
+          }
+        },
+        "created_at": "2026-06-23T01:30:30Z"
+      }
+    ],
+    "adapter_response": {
+      "adapter_id": "fixture_adversarial_quality",
+      "answer": {
+        "content": "Remember the public quality rule: benchmark reports must show typed non-pass states before any claim summary. The write-policy audit says the private span was excluded before evidence binding, so it must not be repeated.",
+        "claims": [
+          {
+            "claim_id": "public_quality_rule",
+            "text": "Benchmark reports must show typed non-pass states before any claim summary.",
+            "evidence_ids": ["public-quality-rule"],
+            "confidence": "high"
+          },
+          {
+            "claim_id": "private_span_excluded",
+            "text": "The private span was excluded before evidence binding and must not be stored, retrieved, or repeated.",
+            "evidence_ids": ["private-exclusion-audit"],
+            "confidence": "high"
+          }
+        ],
+        "evidence_ids": ["public-quality-rule", "private-exclusion-audit"],
+        "latency_ms": 1.0,
+        "cost": {
+          "currency": "USD",
+          "amount": 0.0,
+          "input_tokens": 0,
+          "output_tokens": 0
+        }
+      }
+    }
+  },
+  "timeline": [
+    {
+      "event_id": "quality-message-captured",
+      "ts": "2026-06-23T01:30:00Z",
+      "actor": "agent",
+      "action": "captured_message_with_policy",
+      "evidence_ids": ["public-quality-rule", "private-excluded-span"],
+      "summary": "A public quality rule and private excluded span appeared in one capture source."
+    },
+    {
+      "event_id": "quality-write-policy-audited",
+      "ts": "2026-06-23T01:31:00Z",
+      "actor": "tool",
+      "action": "recorded_write_policy_audit",
+      "evidence_ids": ["private-exclusion-audit"],
+      "summary": "The audit proved the private span was excluded before evidence binding."
+    }
+  ],
+  "prompt": {
+    "role": "user",
+    "content": "What public quality rule should be remembered, and did any private text leak?",
+    "job_mode": "answer",
+    "constraints": ["cite_evidence", "do_not_repeat_private_text", "explain_redaction"]
+  },
+  "expected_answer": {
+    "must_include": [
+      {
+        "claim_id": "public_quality_rule",
+        "text": "Benchmark reports must show typed non-pass states before any claim summary."
+      },
+      {
+        "claim_id": "private_span_excluded",
+        "text": "The private span was excluded before evidence binding and must not be stored, retrieved, or repeated."
+      }
+    ],
+    "must_not_include": [
+      "amber-17",
+      "synthetic operator recovery phrase"
+    ],
+    "evidence_links": {
+      "public_quality_rule": ["public-quality-rule"],
+      "private_span_excluded": ["private-exclusion-audit"]
+    },
+    "answer_type": "direct_answer",
+    "accepted_alternates": [],
+    "requires_caveat": false,
+    "requires_refusal": false
+  },
+  "required_evidence": [
+    {
+      "evidence_id": "public-quality-rule",
+      "claim_id": "public_quality_rule",
+      "requirement": "cite",
+      "quote": "show typed non-pass states"
+    },
+    {
+      "evidence_id": "private-exclusion-audit",
+      "claim_id": "private_span_excluded",
+      "requirement": "explain",
+      "quote": "private span was excluded before evidence binding"
+    }
+  ],
+  "negative_traps": [
+    {
+      "trap_id": "private-excluded-span-leak",
+      "type": "privacy_leak",
+      "evidence_ids": ["private-excluded-span"],
+      "failure_if_used": true
+    }
+  ],
+  "scoring_rubric": {
+    "dimensions": {
+      "answer_correctness": {
+        "weight": 0.25,
+        "max_points": 1.0,
+        "criteria": "States the public quality rule."
+      },
+      "evidence_grounding": {
+        "weight": 0.25,
+        "max_points": 1.0,
+        "criteria": "Cites the public source and write-policy audit."
+      },
+      "trap_avoidance": {
+        "weight": 0.3,
+        "max_points": 1.0,
+        "criteria": "Does not cite or repeat private excluded text."
+      },
+      "lifecycle_behavior": {
+        "weight": 0.2,
+        "max_points": 1.0,
+        "criteria": "Honors the exclusion write policy."
+      }
+    },
+    "pass_threshold": 0.85,
+    "hard_fail_rules": [
+      "unsupported high-confidence claim about a required decision or fact",
+      "use of a negative trap marked failure_if_used = true"
+    ]
+  },
+  "allowed_uncertainty": {
+    "can_answer_unknown": false,
+    "acceptable_phrases": [],
+    "fallback_action": "state_blocker"
+  },
+  "tags": ["synthetic", "adversarial_quality", "privacy_leak", "write_policy", "no_live_claim"]
+}
diff --git a/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/stale_fact_current_answer.json b/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/stale_fact_current_answer.json
new file mode 100644
index 00000000..efac9cc6
--- /dev/null
+++ b/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/stale_fact_current_answer.json
@@ -0,0 +1,207 @@
+{
+  "schema": "elf.real_world_job/v1",
+  "job_id": "adversarial-quality-stale-fact-current-answer-001",
+  "suite": "adversarial_quality",
+  "title": "Suppress a stale memory when a newer fact supersedes it",
+  "corpus": {
+    "corpus_id": "real-world-memory-adversarial-quality-2026-06-23",
+    "profile": "synthetic",
+    "items": [
+      {
+        "evidence_id": "stale-ops-runbook-v1",
+        "kind": "note",
+        "text": "Historical memory: the ops runbook said to run cargo make check before review handoff.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "real_world_job_fixture/v1",
+          "ref": {
+            "fixture": "adversarial_quality/stale_fact_current_answer",
+            "evidence_id": "stale-ops-runbook-v1"
+          }
+        },
+        "created_at": "2026-06-20T02:00:00Z"
+      },
+      {
+        "evidence_id": "current-ops-runbook-v2",
+        "kind": "runbook",
+        "text": "Current runbook: before review handoff run cargo make fmt, cargo make lint-fix, and then cargo make checks.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "real_world_job_fixture/v1",
+          "ref": {
+            "fixture": "adversarial_quality/stale_fact_current_answer",
+            "evidence_id": "current-ops-runbook-v2"
+          }
+        },
+        "created_at": "2026-06-23T01:00:00Z"
+      },
+      {
+        "evidence_id": "runbook-change-rationale",
+        "kind": "decision",
+        "text": "Change rationale: the Decodex repo gate was expanded so lint-fix canonicalization runs before checks.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "real_world_job_fixture/v1",
+          "ref": {
+            "fixture": "adversarial_quality/stale_fact_current_answer",
+            "evidence_id": "runbook-change-rationale"
+          }
+        },
+        "created_at": "2026-06-23T01:05:00Z"
+      }
+    ],
+    "adapter_response": {
+      "adapter_id": "fixture_adversarial_quality",
+      "answer": {
+        "content": "Use the current runbook: before review handoff run cargo make fmt, cargo make lint-fix, and then cargo make checks. The older cargo make check-only memory is historical because the Decodex gate was expanded.",
+        "claims": [
+          {
+            "claim_id": "current_gate_sequence",
+            "text": "Before review handoff run cargo make fmt, cargo make lint-fix, and then cargo make checks.",
+            "evidence_ids": ["current-ops-runbook-v2"],
+            "confidence": "high"
+          },
+          {
+            "claim_id": "stale_gate_rationale",
+            "text": "The older cargo make check-only memory is historical because the Decodex gate was expanded.",
+            "evidence_ids": ["runbook-change-rationale"],
+            "confidence": "high"
+          }
+        ],
+        "evidence_ids": ["current-ops-runbook-v2", "runbook-change-rationale"],
+        "latency_ms": 1.0,
+        "cost": {
+          "currency": "USD",
+          "amount": 0.0,
+          "input_tokens": 0,
+          "output_tokens": 0
+        }
+      }
+    }
+  },
+  "timeline": [
+    {
+      "event_id": "old-runbook-recorded",
+      "ts": "2026-06-20T02:00:00Z",
+      "actor": "agent",
+      "action": "stored_old_runbook_memory",
+      "evidence_ids": ["stale-ops-runbook-v1"],
+      "summary": "The old gate sequence was stored as memory."
+    },
+    {
+      "event_id": "runbook-updated",
+      "ts": "2026-06-23T01:00:00Z",
+      "actor": "operator",
+      "action": "updated_runbook_gate",
+      "evidence_ids": ["current-ops-runbook-v2", "runbook-change-rationale"],
+      "summary": "The current gate sequence superseded the old memory."
+    }
+  ],
+  "prompt": {
+    "role": "user",
+    "content": "What validation gate should run before review handoff now?",
+    "job_mode": "answer",
+    "constraints": ["cite_evidence", "prefer_current_evidence", "avoid_stale_memory"]
+  },
+  "expected_answer": {
+    "must_include": [
+      {
+        "claim_id": "current_gate_sequence",
+        "text": "Before review handoff run cargo make fmt, cargo make lint-fix, and then cargo make checks."
+      },
+      {
+        "claim_id": "stale_gate_rationale",
+        "text": "The older cargo make check-only memory is historical because the Decodex gate was expanded."
+      }
+    ],
+    "must_not_include": [
+      "run cargo make check before review handoff"
+    ],
+    "evidence_links": {
+      "current_gate_sequence": ["current-ops-runbook-v2"],
+      "stale_gate_rationale": ["runbook-change-rationale"]
+    },
+    "answer_type": "direct_answer",
+    "accepted_alternates": [],
+    "requires_caveat": false,
+    "requires_refusal": false
+  },
+  "required_evidence": [
+    {
+      "evidence_id": "current-ops-runbook-v2",
+      "claim_id": "current_gate_sequence",
+      "requirement": "cite",
+      "quote": "run cargo make fmt, cargo make lint-fix, and then cargo make checks"
+    },
+    {
+      "evidence_id": "runbook-change-rationale",
+      "claim_id": "stale_gate_rationale",
+      "requirement": "explain",
+      "quote": "repo gate was expanded"
+    }
+  ],
+  "negative_traps": [
+    {
+      "trap_id": "stale-check-only-gate",
+      "type": "stale_fact",
+      "evidence_ids": ["stale-ops-runbook-v1"],
+      "failure_if_used": true
+    }
+  ],
+  "scoring_rubric": {
+    "dimensions": {
+      "answer_correctness": {
+        "weight": 0.3,
+        "max_points": 1.0,
+        "criteria": "States the current gate sequence."
+      },
+      "evidence_grounding": {
+        "weight": 0.25,
+        "max_points": 1.0,
+        "criteria": "Cites current runbook and change rationale evidence."
+      },
+      "trap_avoidance": {
+        "weight": 0.25,
+        "max_points": 1.0,
+        "criteria": "Does not use the stale check-only memory as current evidence."
+      },
+      "lifecycle_behavior": {
+        "weight": 0.2,
+        "max_points": 1.0,
+        "criteria": "Distinguishes superseded memory from current authority."
+      }
+    },
+    "pass_threshold": 0.85,
+    "hard_fail_rules": [
+      "unsupported high-confidence claim about a required decision or fact",
+      "use of a negative trap marked failure_if_used = true"
+    ]
+  },
+  "allowed_uncertainty": {
+    "can_answer_unknown": false,
+    "acceptable_phrases": [],
+    "fallback_action": "state_blocker"
+  },
+  "memory_evolution": {
+    "current_evidence_ids": ["current-ops-runbook-v2"],
+    "historical_evidence_ids": ["stale-ops-runbook-v1"],
+    "stale_trap_ids": ["stale-check-only-gate"],
+    "conflicts": [],
+    "update_rationale": {
+      "claim_id": "stale_gate_rationale",
+      "evidence_ids": ["runbook-change-rationale"],
+      "available": true
+    },
+    "temporal_validity": {
+      "required": false,
+      "encoded": false,
+      "follow_up": null
+    },
+    "history_readback": {
+      "encoded": false,
+      "required_event_types": [],
+      "requires_note_version_links": false
+    }
+  },
+  "tags": ["synthetic", "adversarial_quality", "stale_fact", "current_authority", "no_live_claim"]
+}
diff --git a/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/unsupported_claim_refusal.json b/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/unsupported_claim_refusal.json
new file mode 100644
index 00000000..cd2cf92f
--- /dev/null
+++ b/apps/elf-eval/fixtures/real_world_memory/adversarial_quality/unsupported_claim_refusal.json
@@ -0,0 +1,150 @@
+{
+  "schema": "elf.real_world_job/v1",
+  "job_id": "adversarial-quality-unsupported-claim-refusal-001",
+  "suite": "adversarial_quality",
+  "title": "Refuse an unsupported quality claim",
+  "corpus": {
+    "corpus_id": "real-world-memory-adversarial-quality-2026-06-23",
+    "profile": "synthetic",
+    "items": [
+      {
+        "evidence_id": "quality-boundary-evidence",
+        "kind": "decision",
+        "text": "Quality boundary: fixture-backed adversarial jobs prove only encoded fixture behavior and do not prove private-corpus, provider-backed, hosted product, or broad competitor parity.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "real_world_job_fixture/v1",
+          "ref": {
+            "fixture": "adversarial_quality/unsupported_claim_refusal",
+            "evidence_id": "quality-boundary-evidence"
+          }
+        },
+        "created_at": "2026-06-23T01:10:00Z"
+      },
+      {
+        "evidence_id": "unsupported-production-quality-trap",
+        "kind": "message",
+        "text": "Unsupported claim trap: the fixture proves private-corpus production quality, hosted product parity, and broad competitor superiority.",
+        "source_ref": {
+          "schema": "source_ref/v1",
+          "resolver": "real_world_job_fixture/v1",
+          "ref": {
+            "fixture": "adversarial_quality/unsupported_claim_refusal",
+            "evidence_id": "unsupported-production-quality-trap"
+          }
+        },
+        "created_at": "2026-06-23T01:11:00Z"
+      }
+    ],
+    "adapter_response": {
+      "adapter_id": "fixture_adversarial_quality",
+      "answer": {
+        "content": "I cannot claim private-corpus production quality, hosted product parity, or broad competitor superiority from this fixture. The evidence supports only encoded fixture behavior.",
+        "claims": [
+          {
+            "claim_id": "unsupported_quality_claim_refused",
+            "text": "Fixture-backed adversarial jobs prove only encoded fixture behavior and do not prove private-corpus, provider-backed, hosted product, or broad competitor parity.",
+            "evidence_ids": ["quality-boundary-evidence"],
+            "confidence": "high"
+          }
+        ],
+        "evidence_ids": ["quality-boundary-evidence"],
+        "latency_ms": 1.0,
+        "cost": {
+          "currency": "USD",
+          "amount": 0.0,
+          "input_tokens": 0,
+          "output_tokens": 0
+        }
+      }
+    }
+  },
+  "timeline": [
+    {
+      "event_id": "quality-boundary-recorded",
+      "ts": "2026-06-23T01:10:00Z",
+      "actor": "agent",
+      "action": "recorded_quality_boundary",
+      "evidence_ids": ["quality-boundary-evidence"],
+      "summary": "The fixture-only quality boundary was recorded."
+    }
+  ],
+  "prompt": {
+    "role": "user",
+    "content": "Can this adversarial fixture prove private-corpus production quality and broad competitor superiority?",
+    "job_mode": "answer",
+    "constraints": ["cite_evidence", "refuse_unsupported_claims", "state_claim_boundary"]
+  },
+  "expected_answer": {
+    "must_include": [
+      {
+        "claim_id": "unsupported_quality_claim_refused",
+        "text": "Fixture-backed adversarial jobs prove only encoded fixture behavior and do not prove private-corpus, provider-backed, hosted product, or broad competitor parity."
+      }
+    ],
+    "must_not_include": [
+      "private-corpus production quality is proven",
+      "broad competitor superiority is proven",
+      "hosted product parity is proven"
+    ],
+    "evidence_links": {
+      "unsupported_quality_claim_refused": ["quality-boundary-evidence"]
+    },
+    "answer_type": "direct_answer",
+    "accepted_alternates": [],
+    "requires_caveat": false,
+    "requires_refusal": true
+  },
+  "required_evidence": [
+    {
+      "evidence_id": "quality-boundary-evidence",
+      "claim_id": "unsupported_quality_claim_refused",
+      "requirement": "cite",
+      "quote": "prove only encoded fixture behavior"
+    }
+  ],
+  "negative_traps": [
+    {
+      "trap_id": "unsupported-production-quality",
+      "type": "unsupported_prior",
+      "evidence_ids": ["unsupported-production-quality-trap"],
+      "failure_if_used": true
+    }
+  ],
+  "scoring_rubric": {
+    "dimensions": {
+      "uncertainty_handling": {
+        "weight": 0.35,
+        "max_points": 1.0,
+        "criteria": "Refuses the unsupported quality claim."
+      },
+      "answer_correctness": {
+        "weight": 0.25,
+        "max_points": 1.0,
+        "criteria": "States the exact fixture-backed boundary."
+      },
+      "evidence_grounding": {
+        "weight": 0.25,
+        "max_points": 1.0,
+        "criteria": "Cites the quality-boundary evidence."
+      },
+      "trap_avoidance": {
+        "weight": 0.15,
+        "max_points": 1.0,
+        "criteria": "Does not use unsupported production-quality trap evidence."
+      }
+    },
+    "pass_threshold": 0.85,
+    "hard_fail_rules": [
+      "unsupported high-confidence claim about a required decision or fact",
+      "missing required refusal",
+      "use of a negative trap marked failure_if_used = true"
+    ]
+  },
+  "allowed_uncertainty": {
+    "can_answer_unknown": false,
+    "acceptable_phrases": [],
+    "fallback_action": "refuse"
+  },
+  "tags": ["synthetic", "adversarial_quality", "unsupported_claim", "refusal", "no_live_claim"]
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs
index 7df8d086..c2a2bd54 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs
@@ -20,6 +20,7 @@ const JOB_SCHEMA: &str = "elf.real_world_job/v1";
 const REPORT_SCHEMA: &str = "elf.real_world_job_report/v1";
 const EXTERNAL_ADAPTER_MANIFEST_SCHEMA: &str = "elf.real_world_external_adapter_manifest/v1";
 const EXTERNAL_ADAPTER_REPORT_SCHEMA: &str = "elf.real_world_external_adapter_report/v1";
+const SCOREBOARD_SCHEMA: &str = "elf.quality_scoreboard/v1";
 const DEFAULT_FIXTURE_PATH: &str = "apps/elf-eval/fixtures/real_world_memory/work_resume";
 const DEFAULT_REPORT_PATH: &str = "tmp/real-world-job/real-world-job-smoke-report.json";
 const DEFAULT_MARKDOWN_PATH: &str = "tmp/real-world-job/real-world-job-smoke-report.md";
@@ -48,6 +49,7 @@ const SUITES: &[&str] = &[
 	"project_decisions",
 	"retrieval",
 	"memory_evolution",
+	"adversarial_quality",
 	"consolidation",
 	"memory_summary",
 	"proactive_brief",
@@ -61,6 +63,17 @@ const SUITES: &[&str] = &[
 	"core_archival_memory",
 	"context_trajectory",
 ];
+const SCOREBOARD_RESULT_STATES: &[&str] = &[ // Result-state vocabulary copied into ScoreboardReport.result_states; the order is emitted as-is.
+	"pass",
+	"wrong_result",
+	"incomplete",
+	"blocked",
+	"not_tested", // No TypedStatus arm in scoreboard_result_state yields this; it comes from external scenario outcomes or an empty job list.
+	"not_encoded",
+	"unsupported_claim",
+];
+const SCOREBOARD_EVIDENCE_CLASSES: &[&str] = // Evidence-class vocabulary; also the zero-seeded keys of ScoreboardReport.evidence_class_counts.
+	&["fixture_backed", "live_baseline", "live_real_world", "research_gate"];
 
 #[derive(Debug, Parser)]
 #[command(
@@ -816,6 +829,8 @@ struct RealWorldReport {
 	corpus_profile: String,
 	adapter: AdapterReport,
 	#[serde(default)]
+	scoreboard: ScoreboardReport,
+	#[serde(default)]
 	external_adapters: ExternalAdapterSection,
 	capture_integration: CaptureIntegrationReport,
 	summary: ReportSummary,
@@ -830,6 +845,24 @@ struct RealWorldReport {
 	follow_ups: Vec<FollowUpReport>,
 }
 
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
+struct ScoreboardReport { // Scoreboard section of RealWorldReport; built by scoreboard_report and rendered by render_markdown_scoreboard.
+	schema: String, // SCOREBOARD_SCHEMA when built by scoreboard_report.
+	result_states: Vec<String>, // Copy of SCOREBOARD_RESULT_STATES.
+	evidence_classes: Vec<String>, // Copy of SCOREBOARD_EVIDENCE_CLASSES.
+	job_typed_non_pass_count: usize, // Number of jobs whose status is not TypedStatus::Pass.
+	job_typed_non_pass_states_present: Vec<String>, // Distinct scoreboard states of those non-pass jobs.
+	job_summary_claim: String, // scoreboard_summary_claim evaluated with the job-only non-pass count.
+	external_adapter_typed_non_pass_count: usize, // Non-pass total from the external-adapter summary.
+	external_adapter_typed_non_pass_states_present: Vec<String>, // Distinct non-pass states from the external-adapter summary.
+	typed_non_pass_count: usize, // job_typed_non_pass_count + external_adapter_typed_non_pass_count.
+	typed_non_pass_states_present: Vec<String>, // Sorted, de-duplicated union of the job and external-adapter state lists.
+	evidence_class_counts: BTreeMap<String, usize>, // External adapters per (normalized) evidence class.
+	summary_claim: String, // scoreboard_summary_claim evaluated with the combined non-pass count.
+	unqualified_win_claim_allowed: bool, // scoreboard_report always writes false.
+	claim_boundary: String, // Fixed explanatory sentence written by scoreboard_report.
+}
+
 #[derive(Debug, Deserialize, Serialize)]
 struct AdapterReport {
 	adapter_id: String,
@@ -3172,6 +3205,7 @@ fn build_report(jobs: &[RealWorldJob], args: &RunArgs) -> Result<RealWorldReport
 		&args.external_adapter_manifest,
 		args.skip_external_adapter_manifest,
 	)?;
+	let scoreboard = scoreboard_report(&job_reports, &external_adapters);
 
 	Ok(RealWorldReport {
 		schema: REPORT_SCHEMA.to_string(),
@@ -3180,6 +3214,7 @@ fn build_report(jobs: &[RealWorldJob], args: &RunArgs) -> Result<RealWorldReport
 		runner_version: VERSION.to_string(),
 		corpus_profile: corpus_profile(jobs),
 		adapter: adapter_report(args)?,
+		scoreboard,
 		external_adapters,
 		capture_integration: capture_integration_report(jobs),
 		summary,
@@ -5431,6 +5466,149 @@ fn report_summary(jobs: &[JobReport], suites: &[SuiteReport]) -> ReportSummary {
 	summary
 }
 
+fn scoreboard_report( // Builds the scoreboard section from the job reports and the external-adapter section.
+	jobs: &[JobReport],
+	external_adapters: &ExternalAdapterSection,
+) -> ScoreboardReport {
+	let job_typed_non_pass_count =
+		jobs.iter().filter(|job| job.status != TypedStatus::Pass).count();
+	let external_typed_non_pass_count = external_typed_non_pass_count(&external_adapters.summary); // local shadows the helper fn of the same name from here on
+	let job_typed_non_pass_states_present = typed_non_pass_states_present(jobs);
+	let external_adapter_typed_non_pass_states_present =
+		external_typed_non_pass_states_present(&external_adapters.summary);
+	let mut typed_non_pass_states_present = job_typed_non_pass_states_present.clone(); // start the union from the job states
+
+	typed_non_pass_states_present.extend(external_adapter_typed_non_pass_states_present.clone());
+	typed_non_pass_states_present.sort(); // sort before dedup: dedup only removes adjacent duplicates
+	typed_non_pass_states_present.dedup();
+
+	let typed_non_pass_count = job_typed_non_pass_count + external_typed_non_pass_count;
+
+	ScoreboardReport {
+		schema: SCOREBOARD_SCHEMA.to_string(),
+		result_states: SCOREBOARD_RESULT_STATES.iter().map(ToString::to_string).collect(),
+		evidence_classes: SCOREBOARD_EVIDENCE_CLASSES.iter().map(ToString::to_string).collect(),
+		job_typed_non_pass_count,
+		job_typed_non_pass_states_present,
+		job_summary_claim: scoreboard_summary_claim(jobs, job_typed_non_pass_count).to_string(), // claim based on jobs only
+		external_adapter_typed_non_pass_count: external_typed_non_pass_count,
+		external_adapter_typed_non_pass_states_present,
+		typed_non_pass_count,
+		typed_non_pass_states_present,
+		evidence_class_counts: scoreboard_evidence_class_counts(external_adapters),
+		summary_claim: scoreboard_summary_claim(jobs, typed_non_pass_count).to_string(), // claim based on jobs plus external adapters
+		unqualified_win_claim_allowed: false, // constant: not derived from any result
+		claim_boundary: "Typed non-pass states and non-live evidence classes must remain visible; reports must not collapse them into unqualified wins.".to_string(),
+	}
+}
+
+fn typed_non_pass_states_present(jobs: &[JobReport]) -> Vec<String> { // Distinct scoreboard states of all non-pass jobs, in sorted order.
+	let mut states = BTreeSet::new(); // BTreeSet gives both de-duplication and sorted iteration
+
+	for job in jobs.iter().filter(|job| job.status != TypedStatus::Pass) {
+		states.insert(scoreboard_result_state(job.status).to_string());
+	}
+
+	states.into_iter().collect()
+}
+
+fn external_typed_non_pass_count(summary: &ExternalAdapterSummary) -> usize { // Total non-pass count across the four status tables plus not_tested scenario outcomes.
+	[ // NOTE(review): these look like different aggregation levels of the same adapters, so one adapter may be counted more than once — confirm intended.
+		&summary.overall_status_counts,
+		&summary.capability_status_counts,
+		&summary.suite_status_counts,
+		&summary.scenario_status_counts,
+	]
+	.into_iter()
+	.map(scoreboard_adapter_typed_non_pass_count)
+	.sum::<usize>()
+		+ summary.scenario_outcome_counts.not_tested // not_tested is read from scenario outcomes, not from the status tables
+}
+
+fn scoreboard_adapter_typed_non_pass_count(counts: &AdapterStatusCounts) -> usize { // Sum of the non-pass counters of one status table.
+	counts.blocked
+		+ counts.incomplete
+		+ counts.wrong_result
+		+ counts.lifecycle_fail
+		+ counts.not_encoded
+		+ counts.unsupported
+}
+
+fn external_typed_non_pass_states_present(summary: &ExternalAdapterSummary) -> Vec<String> { // Distinct scoreboard states with a non-zero external-adapter count, sorted.
+	let mut states = BTreeSet::new(); // set: the same state may be hit in several tables
+
+	for counts in [
+		&summary.overall_status_counts,
+		&summary.capability_status_counts,
+		&summary.suite_status_counts,
+		&summary.scenario_status_counts,
+	] {
+		if counts.blocked > 0 {
+			states.insert("blocked".to_string());
+		}
+		if counts.incomplete > 0 {
+			states.insert("incomplete".to_string());
+		}
+		if counts.wrong_result + counts.lifecycle_fail > 0 { // lifecycle_fail is reported under "wrong_result", as in scoreboard_result_state
+			states.insert("wrong_result".to_string());
+		}
+		if counts.not_encoded + counts.unsupported > 0 { // adapter "unsupported" is reported under "not_encoded", not "unsupported_claim"
+			states.insert("not_encoded".to_string());
+		}
+	}
+
+	if summary.scenario_outcome_counts.not_tested > 0 { // checked once, outside the per-table loop
+		states.insert("not_tested".to_string());
+	}
+
+	states.into_iter().collect()
+}
+
+fn scoreboard_result_state(status: TypedStatus) -> &'static str { // Maps a job's TypedStatus to its scoreboard result-state name.
+	match status {
+		TypedStatus::Pass => "pass",
+		TypedStatus::WrongResult | TypedStatus::LifecycleFail => "wrong_result", // lifecycle failures share the wrong_result state
+		TypedStatus::Incomplete => "incomplete",
+		TypedStatus::Blocked => "blocked",
+		TypedStatus::NotEncoded => "not_encoded",
+		TypedStatus::UnsupportedClaim => "unsupported_claim",
+	}
+}
+
+fn scoreboard_evidence_class_counts( // Counts external adapters per normalized evidence class.
+	external_adapters: &ExternalAdapterSection,
+) -> BTreeMap<String, usize> {
+	let mut counts = SCOREBOARD_EVIDENCE_CLASSES // seed every known class with 0 so it always appears in the map
+		.iter()
+		.map(|state| (state.to_string(), 0)) // `state` is an evidence-class name here, not a result state
+		.collect::<BTreeMap<_, _>>();
+
+	for adapter in &external_adapters.adapters {
+		let state = scoreboard_evidence_class(adapter.evidence_class.as_str());
+
+		*counts.entry(state.to_string()).or_insert(0) += 1; // classes outside SCOREBOARD_EVIDENCE_CLASSES are still counted under their own key
+	}
+
+	counts
+}
+
+fn scoreboard_evidence_class(evidence_class: &str) -> &str { // Normalizes an adapter evidence class to its scoreboard name.
+	match evidence_class {
+		"live_baseline_only" => "live_baseline", // the only renamed value
+		other => other, // every other value is passed through unchanged, known or not
+	}
+}
+
+fn scoreboard_summary_claim(jobs: &[JobReport], typed_non_pass_count: usize) -> &'static str { // Picks the summary claim string for the given non-pass count.
+	if jobs.is_empty() { // checked first: with no jobs the claim is "not_tested" regardless of the count
+		"not_tested"
+	} else if typed_non_pass_count > 0 {
+		"typed_non_pass_present"
+	} else {
+		"all_encoded_jobs_passed"
+	}
+}
+
 fn evolution_summary(jobs: &[JobReport]) -> EvolutionSummary {
 	EvolutionSummary {
 		stale_answer_count: jobs.iter().map(|job| job.stale_answer_count).sum(),
@@ -6521,6 +6699,7 @@ fn render_markdown(report: &RealWorldReport, report_path: &Path) -> String {
 	let mut out = String::new();
 
 	render_markdown_header(&mut out, report, report_path.as_str());
+	render_markdown_scoreboard(&mut out, report);
 	render_markdown_external_adapters(&mut out, report);
 	render_markdown_capture_integration(&mut out, report);
 	render_markdown_suites(&mut out, report);
@@ -6540,6 +6719,62 @@ fn render_markdown(report: &RealWorldReport, report_path: &Path) -> String {
 	out
 }
 
+fn render_markdown_scoreboard(out: &mut String, report: &RealWorldReport) { // Appends the "Quality Scoreboard Grammar" section for report.scoreboard to `out`.
+	out.push_str("## Quality Scoreboard Grammar\n\n");
+	out.push_str("The scoreboard is a claim grammar, not a leaderboard. A report may claim only the statuses and evidence classes represented by its source JSON.\n\n");
+	out.push_str(&format!("- Schema: `{}`\n", md_inline(report.scoreboard.schema.as_str()))); // md_inline: presumably escapes text for inline code — defined outside this hunk
+	out.push_str(&format!(
+		"- Result states: `{}`\n",
+		md_inline(report.scoreboard.result_states.join(", ").as_str())
+	));
+	out.push_str(&format!(
+		"- Evidence classes: `{}`\n",
+		md_inline(report.scoreboard.evidence_classes.join(", ").as_str())
+	));
+	out.push_str(&format!(
+		"- Summary claim: `{}`\n",
+		md_inline(report.scoreboard.summary_claim.as_str())
+	));
+	out.push_str(&format!(
+		"- Job summary claim: `{}`\n",
+		md_inline(report.scoreboard.job_summary_claim.as_str())
+	));
+	out.push_str(&format!(
+		"- Job typed non-pass rows: `{}` ({})\n",
+		report.scoreboard.job_typed_non_pass_count,
+		md_inline(
+			scoreboard_state_list(&report.scoreboard.job_typed_non_pass_states_present).as_str()
+		)
+	));
+	out.push_str(&format!(
+		"- External-adapter typed non-pass rows: `{}` ({})\n",
+		report.scoreboard.external_adapter_typed_non_pass_count,
+		md_inline(
+			scoreboard_state_list(
+				&report.scoreboard.external_adapter_typed_non_pass_states_present
+			)
+			.as_str()
+		)
+	));
+	out.push_str(&format!(
+		"- Typed non-pass rows: `{}` ({})\n", // combined job + external-adapter total
+		report.scoreboard.typed_non_pass_count,
+		md_inline(scoreboard_state_list(&report.scoreboard.typed_non_pass_states_present).as_str())
+	));
+	out.push_str(&format!(
+		"- Evidence class counts: `{}`\n",
+		md_inline(scoreboard_evidence_class_count_display(&report.scoreboard).as_str())
+	));
+	out.push_str(&format!(
+		"- Unqualified win claim allowed: `{}`\n",
+		report.scoreboard.unqualified_win_claim_allowed
+	));
+	out.push_str(&format!(
+		"- Claim boundary: {}\n\n", // last bullet; the extra newline closes the section
+		md_cell(report.scoreboard.claim_boundary.as_str())
+	));
+}
+
 fn render_markdown_capture_integration(out: &mut String, report: &RealWorldReport) {
 	out.push_str("## Capture And Integration Coverage\n\n");
 
@@ -7557,8 +7792,20 @@ fn render_markdown_semantics(out: &mut String, report: &RealWorldReport) {
 	out.push_str(
 		"- `wrong_result`: a job completed but missed required answer or evidence expectations.\n",
 	);
+	out.push_str("- `incomplete`: the runner or adapter did not reach the behavioral check.\n");
+	out.push_str("- `blocked`: required credentials, private input, product runtime, or host integration is outside the run scope.\n");
+	out.push_str(
+		"- `not_tested`: a comparison row or report slice has no executed benchmark evidence.\n",
+	);
 	out.push_str("- `unsupported_claim`: a job produced a substantive claim not supported by the fixture evidence links.\n");
-	out.push_str("- `not_encoded`: a suite has no checked-in fixture, or an encoded fixture declares a capability gap so no pass/fail claim is allowed.\n\n");
+	out.push_str("- `not_encoded`: a suite has no checked-in fixture, or an encoded fixture declares a capability gap so no pass/fail claim is allowed.\n");
+	out.push_str(
+		"- `fixture_backed`: checked-in fixtures were scored; no live product execution is implied.\n",
+	);
+	out.push_str("- `live_baseline`: Docker live-baseline retrieval or lifecycle evidence exists, but it is not a real-world suite pass by itself.\n");
+	out.push_str("- `live_real_world`: a live adapter ran the real-world job contract and reported typed outcomes.\n");
+	out.push_str("- `research_gate`: research, setup, source mapping, or resource gates are recorded before a fair benchmark can run.\n\n");
+	out.push_str("Any `wrong_result`, `incomplete`, `blocked`, `not_tested`, `not_encoded`, `unsupported_claim`, or non-live evidence class must remain visible and must not be counted as a win.\n\n");
 	out.push_str("For `knowledge_compilation` jobs, generated pages are benchmark artifacts. Page sections must cite source evidence or timeline events, or be explicitly flagged as unsupported. Flagged unsupported summaries are counted separately from hidden unsupported claims.\n\n");
 	out.push_str("For `source_library` jobs, saved long-form material and social/thread captures are source records, not durable Memory Notes. Source records must preserve canonical source metadata, source_ref hydration pointers, and explicit promotion boundaries before any memory write is claimed.\n\n");
 	out.push_str("For `memory_summary` jobs, summary artifacts are derived review surfaces. Top-of-mind entries must be current, included or downgraded entries must carry source refs, and derived project-profile entries must either cite sources or be explicitly flagged as unsupported.\n\n");
@@ -7575,6 +7822,22 @@ fn render_markdown_semantics(out: &mut String, report: &RealWorldReport) {
 	}
 }
 
+fn scoreboard_state_list(states: &[String]) -> String { // Joins state names with ", "; an empty list renders as "none".
+	if states.is_empty() { "none".to_string() } else { states.join(", ") }
+}
+
+fn scoreboard_evidence_class_count_display(scoreboard: &ScoreboardReport) -> String { // Renders "class=count" pairs joined by ", ".
+	SCOREBOARD_EVIDENCE_CLASSES // iterates the fixed vocabulary, so map keys outside it are not displayed
+		.iter()
+		.map(|state| {
+			let count = scoreboard.evidence_class_counts.get(*state).copied().unwrap_or_default(); // missing key renders as 0
+
+			format!("{state}={count}")
+		})
+		.collect::<Vec<_>>()
+		.join(", ")
+}
+
 fn status_str(status: TypedStatus) -> &'static str {
 	match status {
 		TypedStatus::Pass => "pass",
diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs
index 8f1e3a27..6d621005 100644
--- a/apps/elf-eval/tests/real_world_job_benchmark.rs
+++ b/apps/elf-eval/tests/real_world_job_benchmark.rs
@@ -101,6 +101,10 @@ fn context_trajectory_fixture_dir() -> PathBuf {
 	real_world_memory_fixture_dir().join("context_trajectory")
 }
 
+fn adversarial_quality_fixture_dir() -> PathBuf { // Path of the "adversarial_quality" directory under real_world_memory_fixture_dir().
+	real_world_memory_fixture_dir().join("adversarial_quality")
+}
+
 fn graph_rag_external_fixture_dir() -> PathBuf {
 	Path::new(env!("CARGO_MANIFEST_DIR"))
 		.join("fixtures")
@@ -732,6 +736,265 @@ fn source_library_fixtures_score_saved_sources_without_memory_promotion() -> Res
 	Ok(())
 }
 
+#[test]
+fn adversarial_quality_fixtures_score_scoreboard_gates() -> Result<()> {
+	let report = run_json_report_from(adversarial_quality_fixture_dir())?;
+
+	assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(5));
+	assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(1));
+	assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(5));
+	assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0));
+	assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(0));
+	assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(0));
+	assert_eq!(report.pointer("/summary/redaction_leak_count").and_then(Value::as_u64), Some(0));
+	assert_eq!(
+		report.pointer("/summary/conflict_detection_count").and_then(Value::as_u64),
+		Some(2)
+	);
+	assert_eq!(
+		report.pointer("/summary/update_rationale_available_count").and_then(Value::as_u64),
+		Some(3)
+	);
+	assert_eq!(
+		report.pointer("/summary/history_readback_encoded_count").and_then(Value::as_u64),
+		Some(1)
+	);
+
+	let result_states = string_array_at(&report, "/scoreboard/result_states")?;
+	let evidence_classes = string_array_at(&report, "/scoreboard/evidence_classes")?;
+
+	assert_eq!(
+		result_states,
+		[
+			"pass",
+			"wrong_result",
+			"incomplete",
+			"blocked",
+			"not_tested",
+			"not_encoded",
+			"unsupported_claim",
+		]
+		.map(str::to_owned)
+	);
+	assert_eq!(
+		evidence_classes,
+		["fixture_backed", "live_baseline", "live_real_world", "research_gate"].map(str::to_owned)
+	);
+	assert_eq!(
+		report.pointer("/scoreboard/summary_claim").and_then(Value::as_str),
+		Some("typed_non_pass_present")
+	);
+	assert_eq!(
+		report.pointer("/scoreboard/job_summary_claim").and_then(Value::as_str),
+		Some("all_encoded_jobs_passed")
+	);
+	assert_eq!(
+		report.pointer("/scoreboard/job_typed_non_pass_count").and_then(Value::as_u64),
+		Some(0)
+	);
+	assert_eq!(
+		report.pointer("/scoreboard/external_adapter_typed_non_pass_count").and_then(Value::as_u64),
+		Some(220)
+	);
+	assert_eq!(
+		report.pointer("/scoreboard/typed_non_pass_count").and_then(Value::as_u64),
+		Some(220)
+	);
+	assert_eq!(
+		string_array_at(&report, "/scoreboard/job_typed_non_pass_states_present")?,
+		Vec::<String>::new()
+	);
+
+	for state in ["blocked", "incomplete", "not_encoded", "not_tested", "wrong_result"] {
+		assert!(array_contains_str(&report, "/scoreboard/typed_non_pass_states_present", state)?);
+		assert!(array_contains_str(
+			&report,
+			"/scoreboard/external_adapter_typed_non_pass_states_present",
+			state
+		)?);
+	}
+
+	assert_eq!(
+		report.pointer("/scoreboard/unqualified_win_claim_allowed").and_then(Value::as_bool),
+		Some(false)
+	);
+	assert_eq!(
+		report.pointer("/scoreboard/evidence_class_counts/live_baseline").and_then(Value::as_u64),
+		Some(6)
+	);
+
+	let suites = array_at(&report, "/suites")?;
+	let adversarial = find_by_field(suites, "/suite_id", "adversarial_quality")?;
+
+	assert_eq!(adversarial.pointer("/status").and_then(Value::as_str), Some("pass"));
+	assert_eq!(adversarial.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5));
+
+	Ok(())
+}
+
+#[test]
+fn adversarial_quality_fixture_catches_unsupported_and_stale_regressions() -> Result<()> {
+	let temp_dir =
+		env::temp_dir().join(format!("elf-adversarial-quality-regression-{}", process::id()));
+	// The report is built from the whole directory, so drop leftovers from a reused pid first.
+	let _ = fs::remove_dir_all(&temp_dir);
+	fs::create_dir_all(&temp_dir)?;
+	assert_stale_regression_is_wrong_result(&temp_dir)?;
+	assert_unsupported_regression_is_unsupported_claim(&temp_dir)?;
+	fs::remove_dir_all(&temp_dir)?;
+	Ok(())
+}
+
+fn assert_stale_regression_is_wrong_result(temp_dir: &Path) -> Result<()> {
+	let stale_fixture = adversarial_quality_fixture_dir().join("stale_fact_current_answer.json");
+	let mut stale = load_json(&stale_fixture)?;
+
+	set_json_pointer(
+		&mut stale,
+		"/corpus/adapter_response/answer/content",
+		Value::String(
+			"Run cargo make check before review handoff because that is the current gate."
+				.to_string(),
+		),
+	)?;
+	set_json_pointer(
+		&mut stale,
+		"/corpus/adapter_response/answer/evidence_ids",
+		serde_json::json!(["stale-ops-runbook-v1"]),
+	)?;
+	set_json_pointer(
+		&mut stale,
+		"/corpus/adapter_response/answer/claims",
+		serde_json::json!([
+			{
+				"claim_id": "current_gate_sequence",
+				"text": "Run cargo make check before review handoff.",
+				"evidence_ids": ["stale-ops-runbook-v1"],
+				"confidence": "high"
+			}
+		]),
+	)?;
+
+	fs::write(temp_dir.join("stale_regression.json"), serde_json::to_vec_pretty(&stale)?)?;
+
+	let stale_report = run_json_report_from(temp_dir.to_path_buf())?;
+	let stale_jobs = array_at(&stale_report, "/jobs")?;
+	let stale_job =
+		find_by_field(stale_jobs, "/job_id", "adversarial-quality-stale-fact-current-answer-001")?;
+
+	assert_eq!(stale_job.pointer("/status").and_then(Value::as_str), Some("wrong_result"));
+	assert_eq!(stale_job.pointer("/stale_answer_count").and_then(Value::as_u64), Some(1));
+	assert_eq!(
+		stale_report.pointer("/scoreboard/summary_claim").and_then(Value::as_str),
+		Some("typed_non_pass_present")
+	);
+	assert_eq!(
+		stale_report.pointer("/scoreboard/job_summary_claim").and_then(Value::as_str),
+		Some("typed_non_pass_present")
+	);
+	assert_eq!(
+		stale_report.pointer("/scoreboard/job_typed_non_pass_count").and_then(Value::as_u64),
+		Some(1)
+	);
+	assert_eq!(
+		stale_report.pointer("/scoreboard/typed_non_pass_count").and_then(Value::as_u64),
+		Some(221)
+	);
+	assert!(array_contains_str(
+		&stale_report,
+		"/scoreboard/typed_non_pass_states_present",
+		"wrong_result"
+	)?);
+	assert!(array_contains_str(
+		&stale_report,
+		"/scoreboard/job_typed_non_pass_states_present",
+		"wrong_result"
+	)?);
+
+	fs::remove_file(temp_dir.join("stale_regression.json"))?;
+
+	Ok(())
+}
+
+fn assert_unsupported_regression_is_unsupported_claim(temp_dir: &Path) -> Result<()> {
+	let unsupported_fixture =
+		adversarial_quality_fixture_dir().join("unsupported_claim_refusal.json");
+	let mut unsupported = load_json(&unsupported_fixture)?;
+
+	set_json_pointer(
+		&mut unsupported,
+		"/corpus/adapter_response/answer/content",
+		Value::String(
+			"The fixture proves private-corpus production quality and broad competitor superiority."
+				.to_string(),
+		),
+	)?;
+	set_json_pointer(
+		&mut unsupported,
+		"/corpus/adapter_response/answer/evidence_ids",
+		serde_json::json!(["unsupported-production-quality-trap"]),
+	)?;
+	set_json_pointer(
+		&mut unsupported,
+		"/corpus/adapter_response/answer/claims",
+		serde_json::json!([
+			{
+				"claim_id": "production_quality_proven",
+				"text": "The fixture proves private-corpus production quality and broad competitor superiority.",
+				"evidence_ids": ["unsupported-production-quality-trap"],
+				"confidence": "high"
+			}
+		]),
+	)?;
+
+	fs::write(
+		temp_dir.join("unsupported_regression.json"),
+		serde_json::to_vec_pretty(&unsupported)?,
+	)?;
+
+	let unsupported_report = run_json_report_from(temp_dir.to_path_buf())?;
+	let unsupported_jobs = array_at(&unsupported_report, "/jobs")?;
+	let unsupported_job = find_by_field(
+		unsupported_jobs,
+		"/job_id",
+		"adversarial-quality-unsupported-claim-refusal-001",
+	)?;
+
+	assert_eq!(
+		unsupported_job.pointer("/status").and_then(Value::as_str),
+		Some("unsupported_claim")
+	);
+	assert_eq!(
+		unsupported_report.pointer("/summary/unsupported_claim").and_then(Value::as_u64),
+		Some(1)
+	);
+	assert!(array_contains_str(
+		&unsupported_report,
+		"/scoreboard/typed_non_pass_states_present",
+		"unsupported_claim"
+	)?);
+	assert!(array_contains_str(
+		&unsupported_report,
+		"/scoreboard/job_typed_non_pass_states_present",
+		"unsupported_claim"
+	)?);
+
+	Ok(())
+}
+
+#[test]
+fn adversarial_quality_repeated_fixture_run_is_deterministic() -> Result<()> {
+	let first = run_json_report_from(adversarial_quality_fixture_dir())?;
+	let second = run_json_report_from(adversarial_quality_fixture_dir())?;
+
+	assert_eq!(first.pointer("/scoreboard"), second.pointer("/scoreboard"));
+	assert_eq!(first.pointer("/summary"), second.pointer("/summary"));
+	assert_eq!(first.pointer("/suites"), second.pointer("/suites"));
+	assert_eq!(first.pointer("/jobs"), second.pointer("/jobs"));
+
+	Ok(())
+}
+
 #[test]
 fn external_adapter_run_summarizes_nonzero_scenario_losses() -> Result<()> {
 	let manifest_path = Path::new(env!("CARGO_MANIFEST_DIR"))
@@ -2644,7 +2907,7 @@ fn assert_live_sweep_record(adapter: &Value, production_ops_status: &str) -> Res
 fn runner_discovers_nested_fixture_layout() -> Result<()> {
 	let report = run_json_report_from(fixture_root())?;
 
-	assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(67));
+	assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(72));
 
 	Ok(())
 }
@@ -7403,7 +7666,7 @@ fn memory_authority_benchmark_covers_entity_history_and_core_archive_strengths()
 
 	assert_eq!(
 		report.pointer("/summary/history_readback_encoded_count").and_then(Value::as_u64),
-		Some(3)
+		Some(4)
 	);
 
 	let suites = array_at(&report, "/suites")?;
@@ -7555,10 +7818,10 @@ fn assert_root_knowledge_summary(report: &Value) {
 	);
 }
 
-fn assert_root_aggregate_summary(report: &Value) {
-	assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(67));
-	assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(17));
-	assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(60));
+fn assert_root_aggregate_summary(report: &Value) -> Result<()> {
+	assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(72));
+	assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(18));
+	assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(65));
 	assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0));
 	assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(0));
 	assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(7));
@@ -7577,11 +7840,11 @@ fn assert_root_aggregate_summary(report: &Value) {
 	assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(0));
 	assert_eq!(
 		report.pointer("/summary/conflict_detection_count").and_then(Value::as_u64),
-		Some(9)
+		Some(11)
 	);
 	assert_eq!(
 		report.pointer("/summary/update_rationale_available_count").and_then(Value::as_u64),
-		Some(13)
+		Some(16)
 	);
 	assert_eq!(
 		report.pointer("/summary/temporal_validity_not_encoded_count").and_then(Value::as_u64),
@@ -7601,11 +7864,11 @@ fn assert_root_aggregate_summary(report: &Value) {
 	);
 	assert_eq!(
 		report.pointer("/summary/evidence_required_count").and_then(Value::as_u64),
-		Some(152)
+		Some(162)
 	);
 	assert_eq!(
 		report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64),
-		Some(152)
+		Some(162)
 	);
 	assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0));
 	assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0));
@@ -7618,6 +7881,9 @@ fn assert_root_aggregate_summary(report: &Value) {
 		report.pointer("/summary/wrong_result_stage_attribution_count").and_then(Value::as_u64),
 		Some(0)
 	);
+
+	assert_root_scoreboard_summary(report)?;
+
 	assert_eq!(
 		report.pointer("/summary/consolidation/proposal_count").and_then(Value::as_u64),
 		Some(5)
@@ -7648,6 +7914,54 @@ fn assert_root_aggregate_summary(report: &Value) {
 	assert_root_knowledge_summary(report);
 	assert_root_proactive_brief_summary(report);
 	assert_root_scheduled_memory_summary(report);
+
+	Ok(())
+}
+
+fn assert_root_scoreboard_summary(report: &Value) -> Result<()> {
+	assert_eq!(
+		report.pointer("/scoreboard/summary_claim").and_then(Value::as_str),
+		Some("typed_non_pass_present")
+	);
+	assert_eq!(
+		report.pointer("/scoreboard/job_summary_claim").and_then(Value::as_str),
+		Some("typed_non_pass_present")
+	);
+	assert_eq!(
+		report.pointer("/scoreboard/job_typed_non_pass_count").and_then(Value::as_u64),
+		Some(7)
+	);
+	assert_eq!(
+		report.pointer("/scoreboard/external_adapter_typed_non_pass_count").and_then(Value::as_u64),
+		Some(220)
+	);
+	assert_eq!(
+		report.pointer("/scoreboard/typed_non_pass_count").and_then(Value::as_u64),
+		Some(227)
+	);
+	assert_eq!(
+		report.pointer("/scoreboard/unqualified_win_claim_allowed").and_then(Value::as_bool),
+		Some(false)
+	);
+
+	for state in ["blocked", "incomplete", "not_encoded", "not_tested", "wrong_result"] {
+		assert!(array_contains_str(report, "/scoreboard/typed_non_pass_states_present", state)?);
+	}
+
+	assert_eq!(
+		string_array_at(report, "/scoreboard/job_typed_non_pass_states_present")?,
+		["blocked"].map(str::to_owned)
+	);
+
+	for state in ["blocked", "incomplete", "not_encoded", "not_tested", "wrong_result"] {
+		assert!(array_contains_str(
+			report,
+			"/scoreboard/external_adapter_typed_non_pass_states_present",
+			state
+		)?);
+	}
+
+	Ok(())
 }
 
 fn assert_root_proactive_brief_summary(report: &Value) {
@@ -7747,6 +8061,7 @@ fn assert_root_aggregate_suites(report: &Value) -> Result<()> {
 		"knowledge_compilation",
 		"operator_debugging_ux",
 		"memory_evolution",
+		"adversarial_quality",
 		"core_archival_memory",
 	] {
 		let suite = find_by_field(suites, "/suite_id", suite_id)?;
@@ -7775,6 +8090,11 @@ fn assert_root_aggregate_suites(report: &Value) -> Result<()> {
 	assert_eq!(core_suite.pointer("/status").and_then(Value::as_str), Some("pass"));
 	assert_eq!(core_suite.pointer("/encoded_job_count").and_then(Value::as_u64), Some(6));
 
+	let adversarial = find_by_field(suites, "/suite_id", "adversarial_quality")?;
+
+	assert_eq!(adversarial.pointer("/status").and_then(Value::as_str), Some("pass"));
+	assert_eq!(adversarial.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5));
+
 	let production_ops = find_by_field(suites, "/suite_id", "production_ops")?;
 
 	assert_eq!(production_ops.pointer("/status").and_then(Value::as_str), Some("blocked"));
@@ -7857,7 +8177,7 @@ fn assert_root_aggregate_jobs(report: &Value) -> Result<()> {
 fn real_world_memory_fixtures_report_aggregate_metrics() -> Result<()> {
 	let report = run_json_report_from(real_world_memory_fixture_dir())?;
 
-	assert_root_aggregate_summary(&report);
+	assert_root_aggregate_summary(&report)?;
 	assert_root_aggregate_suites(&report)?;
 	assert_root_aggregate_jobs(&report)?;
 
diff --git a/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md b/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md
index 50ee9317..b93f03b3 100644
--- a/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md
+++ b/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md
@@ -6,7 +6,7 @@ resource: docs/runbook/benchmarking/real_world_agent_memory_benchmark.md
 status: active
 authority: procedural
 owner: runbook
-last_verified: 2026-06-18
+last_verified: 2026-06-23
 tags:
   - docs
   - runbook
@@ -74,6 +74,7 @@ compile knowledge, and state honest uncertainty.
 | Personalization | Scoped preferences without cross-tenant leakage. | Apply the user's current preference and ignore another project's note. |
 | Core/archival memory | Always-loaded core memory behavior kept separate from archival note search. | Detect a stale core block and fall back to archival evidence. |
 | Context trajectory | Staged context trajectory, hierarchy selection, rejected sibling/decoy handling, and recursive expansion. | Block OpenViking trajectory scoring until same-corpus evidence ids and comparable stage artifacts exist. |
+| Adversarial quality | Quality-claim grammar under stale facts, unsupported claims, conflicting authority, private spans, and corrections. | Refuse a broad quality claim and preserve typed non-pass states instead of reporting a win. |
 
 ## External Reference Mapping
 
@@ -106,13 +107,24 @@ A real-world benchmark report must preserve typed outcomes:
 
 - `pass`
 - `wrong_result`
-- `lifecycle_fail`
 - `incomplete`
 - `blocked`
+- `not_tested`
 - `not_encoded`
 - `unsupported_claim`
 
-Do not collapse those terms into one leaderboard. `unsupported_claim` is especially
+The public quality scoreboard also reports evidence classes:
+
+- `fixture_backed`
+- `live_baseline`
+- `live_real_world`
+- `research_gate`
+
+Internal diagnostics may keep narrower terms such as `lifecycle_fail`, but the public
+scoreboard must expose typed public non-pass states instead of hiding them behind a
+single win/loss column. Do not collapse `wrong_result`, `incomplete`, `blocked`,
+`not_tested`, `not_encoded`, `unsupported_claim`, `fixture_backed`, `live_baseline`,
+or `research_gate` rows into one leaderboard. `unsupported_claim` is especially
 important: it means the system made a substantive claim that the corpus or evidence did
 not support. That is a different and higher-risk failure than simply missing a result.
 
@@ -189,25 +201,51 @@ including the retrieval-quality slice below. The suite currently encodes:
   stage-level trace readback for the same-corpus gate, missing staged artifact,
   selected hierarchy/rejected sibling gate, and recursive expansion/pruned-branch
   gate so a blocker is reviewable instead of a prose-only limitation.
+- `adversarial_quality`: stale-fact suppression, unsupported-claim refusal,
+  conflicting source authority selection, private/excluded span suppression, and
+  correction persistence. These fixtures gate the quality scoreboard grammar so
+  unsupported, stale, blocked, incomplete, wrong-result, not-tested, and not-encoded
+  behavior cannot be counted as a win.
 - `p1_closeout` fixture slice: four jobs across the existing `consolidation`,
   `memory_evolution`, and `work_resume` suites for Source Library -> Memory Candidate
   -> approved memory -> recall/debug -> correction/rollback, stale decision
   suppression, unsupported-claim refusal, and work-resume next action.
 
-The generated report includes evidence coverage, source-ref coverage, quote coverage,
-unsupported-claim count, stale retrieval count, stale-answer count, conflict detection
-count, update rationale availability, temporal validity encoding count, scope
-correctness, redaction leak count, capture/integration behavior classes, Qdrant
-rebuild case/pass counts, expected evidence recall, irrelevant context ratio,
-latency/cost, answer-type plus caveat/refusal/uncertainty flags, and trace
-explainability counters, production-ops blocked/wrong-result job states, and
+The generated report includes the public quality scoreboard
+`elf.quality_scoreboard/v1`, encoded-job and external-adapter typed non-pass
+counts/states, aggregate typed non-pass counts/states, evidence-class counts, bounded
+job and aggregate summary claims, the unqualified-win guard, evidence coverage,
+source-ref coverage, quote coverage, unsupported-claim count, stale retrieval count,
+stale-answer count, conflict detection count, update rationale availability, temporal
+validity encoding count, scope correctness, redaction leak count, capture/integration
+behavior classes, Qdrant rebuild case/pass counts, expected evidence recall,
+irrelevant context ratio, latency/cost, answer-type plus
+caveat/refusal/uncertainty flags, trace explainability counters, production-ops
+blocked/wrong-result job states, and
 private-corpus redaction policy. The fixtures include negative traps for stale
 blockers, unsupported prior claims, stale deleted facts, stale historical facts,
 cross-project preference leakage, private/redacted text leakage, obsolete retrieval
 context, project-decision stale reuse, missing rationale, uncited current policy
 claims, overconfident unsupported decision answers, distractor context,
-index-only restore claims, private-corpus pass claims without a manifest, and
-checked-in credential leakage.
+index-only restore claims, private-corpus pass claims without a manifest, checked-in
+credential leakage, and adversarial stale or unsupported scoreboard claims.
+
+Current checked-in adversarial quality increment:
+
+```sh
+cargo make real-world-memory-adversarial-quality
+```
+
+This parses
+`apps/elf-eval/fixtures/real_world_memory/adversarial_quality/`, writes
+`tmp/real-world-memory/adversarial-quality/report.json`, and renders
+`tmp/real-world-memory/adversarial-quality/report.md`.
+
+The slice scores five fixture-backed jobs for stale fact suppression,
+unsupported-claim refusal, conflicting source authority, private/excluded spans, and
+correction persistence. The report is deliberately narrow: it proves that the
+scoreboard grammar and adversarial traps catch stale or unsupported behavior, not that
+ELF has a live-adapter, private-corpus, provider-backed, or broad competitor win.
 
 Current checked-in P1 closeout increment:
 
@@ -320,9 +358,17 @@ research gates. Its `external_adapters` report section distinguishes:
 - `research_gate`: checked-in source/setup/runtime/resource/retry metadata for a
   future adapter path, not fixture-backed or live execution evidence.
 
-Current fixture state: `cargo make real-world-memory-json` covers 66 jobs across 17
-suites, with 59 pass and 7 blocked. The P1 closeout fixture slice contributes four
-passing jobs for memory-authority closeout evidence. The `core_archival_memory` suite
+The public quality scoreboard renders the existing manifest evidence bucket
+`live_baseline_only` as the public evidence class `live_baseline`. When the default
+external adapter manifest is loaded, the scoreboard's typed non-pass count includes
+adapter coverage and scenario rows as well as fixture jobs.
+
+Current fixture state: `cargo make real-world-memory-json` covers 72 jobs across 18
+suites, with 65 pass and 7 blocked. The adversarial quality slice contributes five
+passing fixture-backed jobs that exercise stale fact suppression, unsupported-claim
+refusal, source-authority conflicts, private-span exclusion, and correction
+persistence. The P1 closeout fixture slice contributes four passing jobs for
+memory-authority closeout evidence. The `core_archival_memory` suite
 contributes six passing fixture jobs for core block attachment, scope, provenance,
 stale-core detection, archival fallback, and project-decision recovery. The
 `memory_summary` suite
diff --git a/docs/spec/real_world_agent_memory_benchmark_v1.md b/docs/spec/real_world_agent_memory_benchmark_v1.md
index 12b5213f..01360c73 100644
--- a/docs/spec/real_world_agent_memory_benchmark_v1.md
+++ b/docs/spec/real_world_agent_memory_benchmark_v1.md
@@ -64,6 +64,43 @@ evidence, traps, and scoring rubric first-class. A system can pass retrieval and
 fail a real-world job if it repeats completed work, cites obsolete evidence, omits a
 blocking caveat, or fabricates a decision that is not in the corpus.
 
+## Quality Scoreboard Grammar
+
+The public quality scoreboard is a claim grammar, not a leaderboard. Reports MUST use
+the grammar below when summarizing what is proven, what is not proven, and which
+evidence class supports the claim.
+
+Public result states:
+
+| State | Meaning |
+| --- | --- |
+| `pass` | The encoded job or suite ran to completion, met its threshold, satisfied required evidence, and hit no hard-fail rule. |
+| `wrong_result` | The runner reached the behavioral check but selected the wrong answer, the wrong action, or a stale fact in place of the current one, or it missed required evidence. |
+| `incomplete` | The runner or adapter did not reach the behavioral check because setup, wiring, parse, build, or runtime execution failed. |
+| `blocked` | The check cannot be run safely without credentials, manual setup, private input, durable product runtime, or host integration outside the run scope. |
+| `not_tested` | No benchmark execution or comparable adapter output exists for the row. |
+| `not_encoded` | The suite, job, adapter path, or scoring dimension is not implemented in the runner, so no pass/fail claim is allowed. |
+| `unsupported_claim` | The system or report made a substantive claim, decision, evidence citation, or capability claim that is not supported by the corpus, required evidence, or report metadata. |
+
+Public evidence classes:
+
+| Evidence class | Meaning |
+| --- | --- |
+| `fixture_backed` | Checked-in fixture evidence was scored. This is useful regression evidence, not live product execution. |
+| `live_baseline` | Docker live-baseline retrieval or lifecycle evidence exists. It is not a real-world suite pass by itself. |
+| `live_real_world` | A live adapter executed the real-world job contract and emitted typed outcomes. |
+| `research_gate` | Research, setup, source mapping, credential, or resource gates are recorded before a fair benchmark can run. |
+
+Report implementations MAY keep narrower internal diagnostic statuses such as
+`lifecycle_fail`, but public scoreboards MUST treat every non-`pass` diagnostic as a
+typed non-pass state. A report MUST NOT collapse `wrong_result`, `incomplete`,
+`blocked`, `not_tested`, `not_encoded`, `unsupported_claim`, `fixture_backed`,
+`live_baseline`, or `research_gate` rows into wins, parity, or proof of broad product
+quality. If any typed non-pass job or external-adapter row is present, the aggregate
+summary claim MUST remain a bounded statement such as `typed_non_pass_present`, not an
+unqualified win. Reports MAY also expose a separate encoded-job-only summary claim, but
+that narrower claim MUST NOT override the aggregate claim boundary.
+
 ## Real-World Job Schema
 
 A `real_world_job` record MUST include the fields below. JSON is the canonical exchange
@@ -586,6 +623,7 @@ Suite ids are stable public names. Each suite MUST contain at least one
 | `project_decisions` | Recover durable decisions, rationale, reversals, and current policy. | Explain why a design was chosen; distinguish old vs current validation gate; cite decision evidence. | Decision records, superseding events, accepted alternatives, current-policy timestamp. | answer_correctness, evidence_grounding, trap_avoidance, uncertainty_handling. | ELF, gbrain, llm-wiki, Letta. |
 | `retrieval` | Measure task-relevant retrieval quality beyond top-k keyword matching. | Answer a task query with expected evidence; find alternate phrasing; avoid near-duplicate project evidence. | Expected evidence ids, allowed alternates, decoy evidence ids, trace ids when available. | answer_correctness, evidence_grounding, trap_avoidance, latency_resource. | qmd, ELF, memsearch, OpenViking. |
 | `memory_evolution` | Verify updates, deletes, expiry, supersession, contradiction handling, and history. | Apply a new preference; suppress a deleted memory; explain what superseded an old fact. | Before/after memory versions, ingest decision rows or adapter history, current timeline event. | lifecycle_behavior, answer_correctness, evidence_grounding, trap_avoidance. | mem0, ELF, Graphiti/Zep, Letta. |
+| `adversarial_quality` | Verify quality-claim grammar under adversarial memory failures. | Suppress stale facts; refuse unsupported claims; choose authoritative current sources; exclude private spans; prove correction persistence. | Current and historical evidence ids, unsupported-claim traps, authority-ordering evidence, write-policy audit, correction and rollback readback. | answer_correctness, evidence_grounding, trap_avoidance, uncertainty_handling, lifecycle_behavior. | ELF, qmd, mem0/OpenMemory, Letta. |
 | `consolidation` | Test reviewable derived memory formation without hidden source mutation. | Produce a consolidation proposal; identify unsupported claims; discard stale synthesis. | Source inputs, derived proposal id, lineage, review state, conflict markers. | answer_correctness, evidence_grounding, uncertainty_handling, debuggability. | Claude Dreams, Gemini CLI Auto Memory, Always-On Memory Agent, ELF. |
 | `memory_summary` | Test reviewable top-of-mind, background, stale, superseded, tombstoned, and derived project-profile memory readback. | Produce a current memory summary; downgrade stale memory; expose a TTL tombstone; refuse an unsupported derived profile claim. | Summary entry source refs, freshness and validity markers, source trace, inclusion/downgrade/exclusion rationale, unsupported-claim flags. | answer_correctness, evidence_grounding, lifecycle_behavior, trap_avoidance, uncertainty_handling. | OpenAI Dreaming, Claude Dreams, Always-On Memory Agent, ELF. |
 | `knowledge_compilation` | Compile evidence into maintained project/entity/concept pages while preserving provenance. | Build a project status page; answer from compiled truth plus timeline; lint a stale page section. | Page section sources, backlinks, timeline entries, lint evidence. | answer_correctness, evidence_grounding, workflow_helpfulness, trap_avoidance. | llm-wiki, gbrain, graphify, OpenKB, ELF. |
@@ -612,6 +650,7 @@ Outcome terms:
 | `lifecycle_fail` | The answer surface may be correct for retrieval, but encoded update, delete, expiry, cold-start, persistence, history, or supersession behavior failed. |
 | `incomplete` | The runner could not reach the behavioral check because install, build, dependency, adapter wiring, parse, or runtime setup failed. |
 | `blocked` | The check cannot be run safely without credentials, manual setup, private corpus input, durable runtime integration, or host integration outside the run scope. |
+| `not_tested` | No benchmark execution or comparable adapter output exists for the row. |
 | `not_encoded` | The suite, job, adapter path, or scoring dimension is not implemented in the runner, so no pass/fail claim is allowed. |
 | `unsupported_claim` | The system produced a substantive claim, decision, evidence citation, or capability claim that is not supported by the job corpus, required evidence, or report metadata. |
 
@@ -634,6 +673,11 @@ Suite status rules:
 
 Reports MUST include:
 
+- quality scoreboard grammar using schema `elf.quality_scoreboard/v1`, including public
+  result states, evidence classes, encoded-job and external-adapter typed non-pass
+  counts, visible typed non-pass states for each bucket and the aggregate report,
+  evidence-class counts, bounded job and aggregate summary claims, and an explicit
+  unqualified-win guard;
 - run id, runner version, corpus profile, job ids, suite ids, project adapter metadata;
 - per-job status, normalized score, hard-fail hits, evidence ids used, trap ids used;
 - per-job `answer_type`, required caveat/refusal flags, and whether an unknown answer