diff --git a/src/description-length-gate.ts b/src/description-length-gate.ts
new file mode 100644
index 0000000..8faeb9f
--- /dev/null
+++ b/src/description-length-gate.ts
@@ -0,0 +1,244 @@
+import { gzipSync } from 'node:zlib'
+import type { RunRecord } from './run-record'
+
+/**
+ * DescriptionLengthGate — a Minimum-Description-Length promotion gate, the
+ * Builder/Breaker acceptance rule from Wang & Buehler, "Self-Revising Discovery
+ * Systems for Science" (arXiv:2606.01444, MIT LAMM), eq. 5:
+ *
+ *     L(M, D) = L_model(M) + L_data(D | M)
+ *     accept M' over M  iff  L(M', D∪E) < L(M, D∪E)
+ *
+ * Both candidate and baseline are scored on the SAME enlarged evidence set
+ * (every accumulated task — NOT just the held-out split), and the candidate is
+ * accepted only if it lowers the TOTAL bit cost. This is the gate's whole
+ * point and what distinguishes it from a monotone held-out delta:
+ *
+ *   - L_model(M)  — the candidate's own description length: the compressed size
+ *     of its model text (a prompt, skill, profile, or symbolic model). A bigger
+ *     model pays more bits.
+ *   - L_data(D|M) — the residual: bits of "surprise" that the model did not
+ *     simply succeed, −Σ_i log2(s_i) over the model's per-task score s_i.
+ *     Perfect scores cost 0 bits; failure costs a lot (capped, not infinite).
+ *
+ * A candidate that merely memorizes new counterexamples grows L_model faster
+ * than it shrinks L_data and LOSES — a principled, complexity-penalized
+ * alternative to HeldOutGate's held-out paired delta. Use this gate
+ * when the model text whose size you want to penalize is available; use
+ * HeldOutGate when promotion should turn on held-out generalization with an
+ * overfit-gap check instead.
+ *
+ * Scale / calibration: a gzip'd prose model is hundreds–thousands of bits;
+ * a single task contributes at most −log2(scoreFloor) data bits (≈10). So with
+ * little evidence the model term dominates and the gate is conservative about
+ * model GROWTH — it promotes a larger model only once accumulated evidence
+ * genuinely pays for the added bits (exactly the paper's regime, where D∪E
+ * grows). `lambda` is the lever: λ<1 discounts model bits (more permissive),
+ * λ>1 is stricter. With marginBits = 0, λ > 0 and enough shared tasks, a model that shrinks without doing worse wins; an exact tie in total bits does not promote.
+ *
+ * Stateless: construct once with the description-length budget, call
+ * `evaluate` per (candidate, baseline) pair.
+ */
+
+export type DescriptionLengthRejectionCode = 'few_tasks' | 'no_total_gain' | 'model_bloat'
+
+export interface DescriptionLengthConfig {
+  /** Stable label of the baseline. Required (the constructor throws on an empty
+   *  value) — paper-grade evaluation never compares two unlabelled candidates. */
+  baselineKey: string
+  /** Weight on model bits relative to data bits (the description-length budget
+   *  λ). 1 = bits are bits. >1 = more complexity-averse. Must be ≥ 0. Default 1. */
+  lambda?: number
+  /** The candidate's total must undercut the baseline's by MORE than this many
+   *  bits to promote (the check is `Δ < −marginBits`) — a robustness margin
+   *  against measurement noise. Default 0, i.e. the paper's strict `<`. */
+  marginBits?: number
+  /** Per-task score floor for the residual code: −log2(max(s, floor)). Caps a
+   *  total-failure task's surprise instead of letting it diverge. Must lie in
+   *  (0, 1). Default 2^-10 (a failed task costs 10 bits, not ∞). */
+  scoreFloor?: number
+  /** Minimum number of shared (candidate, baseline) tasks before the gate will
+   *  consider promoting; below it the decision is `few_tasks`. Default 3. */
+  minTasks?: number
+}
+
+export interface DescriptionLengthEvidence {
+  /** Shared tasks scored on both sides (the enlarged evidence D∪E). */
+  tasks: number
+  /** λ-weighted compressed-model bits — λ·L_model (raw gzip bits only if λ=1). */
+  modelBits: { candidate: number; baseline: number }
+  /** Residual surprise bits over the shared tasks — L_data(D|M). */
+  dataBits: { candidate: number; baseline: number }
+  /** modelBits + dataBits = λ·L_model + L_data — what the gate minimizes. */
+  totalBits: { candidate: number; baseline: number }
+  /** candidate − baseline total. Negative = candidate compresses better. */
+  deltaBits: number
+  /** Per-component deltas (candidate − baseline), for audit: did the win come
+   *  from a smaller model, better outcomes, or both? */
+  modelBitsDelta: number
+  dataBitsDelta: number
+}
+
+export interface DescriptionLengthDecision {
+  promote: boolean // true only when the candidate's total bits clear the margin
+  candidateId: string // inferred from the candidate's runs
+  baselineId: string // the gate's configured `baselineKey`
+  evidence: DescriptionLengthEvidence
+  reason: string // human-readable; starts with the rejection code or `promote`
+  rejectionCode: DescriptionLengthRejectionCode | null // null iff `promote` is true
+}
+
+export interface DescriptionLengthCandidate {
+  /** The model text whose gzip-compressed size is L_model (a prompt, skill,
+   *  profile, or symbolic model; concatenated if several files). */
+  content: string
+  /** Runs whose per-task mean scores form L_data; unscored runs are ignored. */
+  runs: RunRecord[]
+}
+
+/** Pick the score a run is judged on: the first non-nullish of `holdoutScore`,
+ *  `searchScore`, `raw.score`. Yields undefined unless it is a finite number. */
+function runScore(run: RunRecord): number | undefined {
+  const { holdoutScore, searchScore, raw } = run.outcome
+  const picked = holdoutScore ?? searchScore ?? raw?.score
+  return typeof picked !== 'number' || !Number.isFinite(picked) ? undefined : picked
+}
+
+/** Pairing key for a run: its scenarioId, or its experimentId when that is unset. */
+function taskKey(run: RunRecord): string {
+  return run.scenarioId == null ? run.experimentId : run.scenarioId
+}
+
+/** One score per task: { taskKey -> arithmetic mean over that task's scored runs }. */
+function perTaskMeanScore(runs: RunRecord[]): Map<string, number> {
+  const totals = new Map<string, [sum: number, count: number]>()
+  for (const run of runs) {
+    const score = runScore(run)
+    if (score === undefined) continue // unscored runs do not dilute the mean
+    const task = taskKey(run)
+    const [sum, count] = totals.get(task) ?? [0, 0]
+    totals.set(task, [sum + score, count + 1])
+  }
+  const means = new Map<string, number>()
+  for (const [task, [sum, count]] of totals) means.set(task, sum / count)
+  return means
+}
+
+/** L_model in bits: 8 × the byte size of the gzip-compressed UTF-8 text. gzip is
+ *  used as a dependency-free stand-in for Kolmogorov complexity. */
+export function modelDescriptionBits(content: string): number {
+  const compressed = gzipSync(Buffer.from(content, 'utf8'))
+  return compressed.length * 8
+}
+
+/** Residual surprise L_data(D|M) = −Σ_i log2(clamp(s_i, floor, 1)), in bits. Keys
+ *  with no score are skipped; the cap at 1 keeps scores > 1 from earning negative bits. */
+export function dataDescriptionBits(
+  scoreByTask: Map<string, number>,
+  keys: Iterable<string>,
+  scoreFloor: number,
+): number {
+  let bits = 0
+  for (const key of keys) {
+    const s = scoreByTask.get(key)
+    if (s === undefined) continue
+    bits += -Math.log2(Math.min(1, Math.max(s, scoreFloor)))
+  }
+  return bits
+}
+
+export class DescriptionLengthGate {
+  private readonly baselineKey: string
+  private readonly lambda: number
+  private readonly marginBits: number
+  private readonly scoreFloor: number
+  private readonly minTasks: number
+
+  /** @throws Error if `baselineKey` is empty or any numeric option is NaN or out of range. */
+  constructor(config: DescriptionLengthConfig) {
+    if (!config.baselineKey) throw new Error('DescriptionLengthGate: baselineKey is required')
+    this.baselineKey = config.baselineKey
+    this.lambda = config.lambda ?? 1
+    this.marginBits = config.marginBits ?? 0
+    this.scoreFloor = config.scoreFloor ?? 2 ** -10
+    this.minTasks = config.minTasks ?? 3
+    // `!(x >= 0)` also rejects NaN. λ must be finite too: ∞·bits on both sides gives Δ = NaN.
+    if (!(this.lambda >= 0 && Number.isFinite(this.lambda)))
+      throw new Error('DescriptionLengthGate: lambda must be finite and ≥ 0')
+    if (!(this.marginBits >= 0)) throw new Error('DescriptionLengthGate: marginBits must be ≥ 0')
+    if (!(this.minTasks >= 0)) throw new Error('DescriptionLengthGate: minTasks must be ≥ 0')
+    if (!(this.scoreFloor > 0 && this.scoreFloor < 1))
+      throw new Error('DescriptionLengthGate: scoreFloor must be in (0,1)')
+  }
+
+  /** Decide whether `candidate` should replace `baseline`. Both are scored on
+   *  the shared task set (the enlarged evidence); the candidate promotes only
+   *  if its λ·L_model + L_data is lower by more than `marginBits`. */
+  evaluate(
+    candidate: DescriptionLengthCandidate,
+    baseline: DescriptionLengthCandidate,
+  ): DescriptionLengthDecision {
+    const candidateId = inferCandidateId(candidate.runs, this.baselineKey)
+    const candScores = perTaskMeanScore(candidate.runs)
+    const baseScores = perTaskMeanScore(baseline.runs)
+    // Enlarged evidence = tasks scored on BOTH sides (paired, like the paper's
+    // "both models refit on the same accumulated evidence").
+    const shared = [...candScores.keys()].filter((k) => baseScores.has(k))
+    // Model bits are stored λ-weighted so that totalBits is a plain sum.
+    const modelBits = {
+      candidate: this.lambda * modelDescriptionBits(candidate.content),
+      baseline: this.lambda * modelDescriptionBits(baseline.content),
+    }
+    const dataBits = {
+      candidate: dataDescriptionBits(candScores, shared, this.scoreFloor),
+      baseline: dataDescriptionBits(baseScores, shared, this.scoreFloor),
+    }
+    const totalBits = {
+      candidate: modelBits.candidate + dataBits.candidate,
+      baseline: modelBits.baseline + dataBits.baseline,
+    }
+    const evidence: DescriptionLengthEvidence = {
+      tasks: shared.length,
+      modelBits,
+      dataBits,
+      totalBits,
+      deltaBits: totalBits.candidate - totalBits.baseline,
+      modelBitsDelta: modelBits.candidate - modelBits.baseline,
+      dataBitsDelta: dataBits.candidate - dataBits.baseline,
+    }
+    const base = { candidateId, baselineId: this.baselineKey, evidence }
+    const fmt = (n: number) => n.toFixed(1)
+    // Every rejection has the same shape; `reason` is always prefixed with its code.
+    const reject = (code: DescriptionLengthRejectionCode, why: string): DescriptionLengthDecision => ({
+      ...base,
+      promote: false,
+      reason: `${code}: ${why}`,
+      rejectionCode: code,
+    })
+
+    if (shared.length < this.minTasks)
+      return reject('few_tasks', `${shared.length} shared task(s) < min ${this.minTasks}`)
+    if (!(evidence.deltaBits < -this.marginBits)) {
+      // No net compression. Blame model growth only when the data gain ALONE would have cleared
+      // the margin (which implies the model grew); otherwise the gain itself was insufficient.
+      const gain = -evidence.dataBitsDelta
+      const miss = `total Δ=${fmt(evidence.deltaBits)} bits does not clear the ${fmt(this.marginBits)}-bit margin`
+      if (gain > this.marginBits)
+        return reject('model_bloat', `${miss} — model grew ${fmt(evidence.modelBitsDelta)} bits, outpacing a ${fmt(gain)}-bit data gain`)
+      if (gain > 0) return reject('no_total_gain', `${miss} — a ${fmt(gain)}-bit data gain alone is within the margin`)
+      return reject('no_total_gain', `${miss} — outcomes did not improve (data Δ=${fmt(evidence.dataBitsDelta)} bits)`)
+    }
+    return {
+      ...base,
+      promote: true,
+      reason: `promote: total Δ=${fmt(evidence.deltaBits)} bits (model Δ=${fmt(evidence.modelBitsDelta)}, data Δ=${fmt(evidence.dataBitsDelta)}) over ${shared.length} tasks`,
+      rejectionCode: null,
+    }
+  }
+}
+
+/** Candidate label for a run set: the first non-empty candidateId that is not
+ *  the baseline's; otherwise the first run's id, or a placeholder for no runs. */
+function inferCandidateId(runs: RunRecord[], baselineKey: string): string {
+  const own = runs.find((r) => Boolean(r.candidateId) && r.candidateId !== baselineKey)
+  return own?.candidateId ?? runs[0]?.candidateId ?? '(unknown candidate)' }
diff --git a/src/index.ts b/src/index.ts
index 1f7f9ba..b08db12 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -998,6 +998,18 @@ export type {
 export { runCanaries } from './canary'
 // ── Concurrency + persistence + telemetry primitives for evolution loops ──
 export { Mutex } from './concurrency'
+export type {
+  DescriptionLengthCandidate,
+  DescriptionLengthConfig,
+  DescriptionLengthDecision,
+  DescriptionLengthEvidence,
+  DescriptionLengthRejectionCode,
+} from './description-length-gate'
+export {
+  DescriptionLengthGate,
+  dataDescriptionBits,
+  modelDescriptionBits,
+} from './description-length-gate'
 export type {
   DiscoveredPersona,
   DiscoverPersonasOptions,
diff --git a/tests/description-length-gate.test.ts b/tests/description-length-gate.test.ts
new file mode 100644
index 0000000..267fcc2
--- /dev/null
+++ b/tests/description-length-gate.test.ts
@@ -0,0 +1,139 @@
+import { createHash } from 'node:crypto'
+import { describe, expect, it } from 'vitest'
+import {
+  type DescriptionLengthCandidate,
+  DescriptionLengthGate,
+  dataDescriptionBits,
+  modelDescriptionBits,
+} from '../src/description-length-gate'
+import type { RunRecord } from '../src/run-record'
+
+function record(candidateId: string, task: string, score: number): RunRecord {
+  return {
+    runId: `${candidateId}-${task}`, // unique per (candidate, task) pair
+    experimentId: `exp:${task}`,
+    candidateId,
+    seed: 0,
+    model: 'claude-sonnet-4-6@2025-04-15',
+    promptHash: 'p'.repeat(64),
+    configHash: 'c'.repeat(64),
+    commitSha: 'deadbeef',
+    wallMs: 1000,
+    costUsd: 0,
+    tokenUsage: { input: 100, output: 100 },
+    outcome: { holdoutScore: score, raw: { score } }, // one score, exposed under both keys
+    splitTag: 'holdout',
+    scenarioId: task, // the task name doubles as the scenario id
+  }
+}
+
+function candidate(
+  id: string,
+  content: string,
+  scores: Record<string, number>,
+): DescriptionLengthCandidate {
+  return { content, runs: Array.from(Object.entries(scores), ([task, s]) => record(id, task, s)) }
+}
+
+const small = 'follow the protocol exactly; verify before finishing.'
+
+describe('description-length primitives', () => {
+  it('model bits grow with model size', () => {
+    expect(modelDescriptionBits(small.repeat(50))).toBeGreaterThan(modelDescriptionBits(small))
+  })
+
+  it('data bits fall as scores rise; perfect scores cost ~0', () => {
+    const keys = ['a', 'b', 'c']
+    const perfect = new Map(keys.map((k) => [k, 1]))
+    const mediocre = new Map(keys.map((k) => [k, 0.5]))
+    expect(dataDescriptionBits(perfect, keys, 2 ** -10)).toBeCloseTo(0, 6) // −log2(1) = 0 per task
+    expect(dataDescriptionBits(mediocre, keys, 2 ** -10)).toBeCloseTo(3, 6) // each 0.5 task = 1 bit
+  })
+
+  it('a failed task is capped by the score floor, not infinite', () => {
+    const bits = dataDescriptionBits(new Map([['a', 0]]), ['a'], 2 ** -10)
+    expect(bits).toBeCloseTo(10, 6) // −log2(2^-10)
+    expect(Number.isFinite(bits)).toBe(true)
+  })
+})
+
+describe('DescriptionLengthGate', () => {
+  const tasks = { a: 0.5, b: 0.5, c: 0.5, d: 0.5 } // baseline outcomes: 4 tasks × 1 bit each
+
+  it('promotes a compact candidate that improves outcomes', () => {
+    const gate = new DescriptionLengthGate({ baselineKey: 'baseline' })
+    const decision = gate.evaluate(
+      candidate('cand', small, { a: 0.9, b: 0.9, c: 0.9, d: 0.9 }),
+      candidate('baseline', small, tasks), // identical text ⇒ model Δ = 0
+    )
+    expect(decision.promote).toBe(true)
+    expect(decision.evidence.dataBitsDelta).toBeLessThan(0)
+    expect(decision.rejectionCode).toBeNull()
+  })
+
+  it('rejects a model that improves outcomes but bloats — the MDL anti-overfit core', () => {
+    const gate = new DescriptionLengthGate({ baselineKey: 'baseline' })
+    // Genuine bloat must be high-entropy — gzip correctly sees through
+    // low-entropy repetition. Incompressible hex stands in for memorization.
+    const entropy = Array.from({ length: 30 }, (_, i) =>
+      createHash('sha256').update(`b${i}`).digest('hex'),
+    ).join('')
+    const bloated = `${small} ${entropy}`
+    const decision = gate.evaluate(
+      candidate('cand', bloated, { a: 0.55, b: 0.55, c: 0.55, d: 0.55 }),
+      candidate('baseline', small, tasks),
+    )
+    expect(decision.promote).toBe(false)
+    expect(decision.rejectionCode).toBe('model_bloat')
+    expect(decision.evidence.dataBitsDelta).toBeLessThan(0) // outcomes DID improve
+    expect(decision.evidence.modelBitsDelta).toBeGreaterThan(0) // but the model grew more
+  })
+
+  it('rejects when outcomes do not improve', () => {
+    const gate = new DescriptionLengthGate({ baselineKey: 'baseline' })
+    const decision = gate.evaluate(
+      candidate('cand', small, { a: 0.4, b: 0.4, c: 0.4, d: 0.4 }),
+      candidate('baseline', small, tasks),
+    )
+    expect(decision.promote).toBe(false)
+    expect(decision.rejectionCode).toBe('no_total_gain')
+  })
+
+  const bigger = `${small} additionally, double-check each edge case named in the tests.`
+  const manyTasks = (score: number) =>
+    Object.fromEntries(Array.from({ length: 350 }, (_, i) => [`t${i}`, score])) // 0.5 → 1 over 350 tasks = a 350-bit data gain
+
+  it('a bigger model wins once enough evidence justifies its bits', () => {
+    const gate = new DescriptionLengthGate({ baselineKey: 'baseline' })
+    const decision = gate.evaluate(
+      candidate('cand', bigger, manyTasks(1)),
+      candidate('baseline', small, manyTasks(0.5)),
+    )
+    expect(decision.evidence.modelBitsDelta).toBeGreaterThan(0)
+    expect(decision.evidence.dataBitsDelta).toBeLessThan(-decision.evidence.modelBitsDelta)
+    expect(decision.promote).toBe(true)
+  })
+
+  it('lambda scales the model penalty: doubling it rejects the same bigger model', () => {
+    const cand = candidate('cand', bigger, manyTasks(1))
+    const base = candidate('baseline', small, manyTasks(0.5))
+    expect(
+      new DescriptionLengthGate({ baselineKey: 'baseline', lambda: 1 }).evaluate(cand, base)
+        .promote,
+    ).toBe(true)
+    expect(
+      new DescriptionLengthGate({ baselineKey: 'baseline', lambda: 2 }).evaluate(cand, base)
+        .promote,
+    ).toBe(false)
+  })
+
+  it('refuses to promote below the task floor', () => {
+    const gate = new DescriptionLengthGate({ baselineKey: 'baseline', minTasks: 3 })
+    const decision = gate.evaluate(
+      candidate('cand', small, { a: 0.9, b: 0.9 }),
+      candidate('baseline', small, { a: 0.5, b: 0.5 }), // only 2 shared tasks < minTasks 3
+    )
+    expect(decision.promote).toBe(false)
+    expect(decision.rejectionCode).toBe('few_tasks')
+  })
+})