diff --git a/src/description-length-gate.ts b/src/description-length-gate.ts new file mode 100644 index 0000000..8faeb9f --- /dev/null +++ b/src/description-length-gate.ts @@ -0,0 +1,244 @@ +import { gzipSync } from 'node:zlib' +import type { RunRecord } from './run-record' + +/** + * DescriptionLengthGate — a Minimum-Description-Length promotion gate, the + * Builder/Breaker acceptance rule from Wang & Buehler, "Self-Revising Discovery + * Systems for Science" (arXiv:2606.01444, MIT LAMM), eq. 5: + * + * L(M, D) = L_model(M) + L_data(D | M) + * accept M' over M iff L(M', D∪E) < L(M, D∪E) + * + * Both candidate and baseline are scored on the SAME enlarged evidence set + * (every accumulated task — NOT just the held-out split), and the candidate is + * accepted only if it lowers the TOTAL bit cost. This is the gate's whole + * point and what distinguishes it from a monotone held-out delta: + * + * - L_model(M) — the candidate's own description length: the compressed size + * of its model text (a prompt, skill, profile, or symbolic model). A bigger + * model pays more bits. + * - L_data(D|M) — the residual: bits of "surprise" that the model did not + * simply succeed, −Σ_i log2(s_i) over the model's per-task score s_i. + * Perfect scores cost 0 bits; failure costs a lot (capped, not infinite). + * + * A candidate that merely memorizes new counterexamples grows L_model faster + * than it shrinks L_data and LOSES — a principled, complexity-penalized + * alternative to HeldOutGate's held-out paired delta. Use this gate + * when the model text whose size you want to penalize is available; use + * HeldOutGate when promotion should turn on held-out generalization with an + * overfit-gap check instead. + * + * Scale / calibration: a gzip'd prose model is hundreds–thousands of bits; + * a single task contributes at most −log2(scoreFloor) data bits (≈10). So with + * little evidence the model term dominates and the gate is conservative about + * model GROWTH — it promotes a larger model only once accumulated evidence + * genuinely pays for the added bits (exactly the paper's regime, where D∪E + * grows). `lambda` is the lever: λ<1 discounts model bits (more permissive), + * λ>1 is stricter. A shrinking-or-equal model that does no worse always wins. + * + * Stateless: construct once with the description-length budget, call + * `evaluate` per (candidate, baseline) pair. + */ + +export type DescriptionLengthRejectionCode = 'few_tasks' | 'no_total_gain' | 'model_bloat' + +export interface DescriptionLengthConfig { + /** Stable label of the baseline. Required — paper-grade evaluation never + * compares two unlabelled candidates. */ + baselineKey: string + /** Weight on model bits relative to data bits (the description-length + * budget λ). 1 = bits are bits. >1 = more complexity-averse. Default 1. */ + lambda?: number + /** The candidate must beat the baseline by at least this many bits to + * promote — a robustness margin against measurement noise. Default 0 + * (strict `<`, as the paper). */ + marginBits?: number + /** Per-task score floor for the residual code: −log2(max(s, floor)). Caps a + * total-failure task's surprise instead of letting it diverge. Default + * 2^-10 (a failed task costs 10 bits, not ∞). */ + scoreFloor?: number + /** Minimum number of shared (candidate, baseline) tasks before the gate will + * consider promoting. Default 3. */ + minTasks?: number +} + +export interface DescriptionLengthEvidence { + /** Shared tasks scored on both sides (the enlarged evidence D∪E). */ + tasks: number + /** Compressed-model bits — L_model. */ + modelBits: { candidate: number; baseline: number } + /** Residual surprise bits — L_data(D|M). */ + dataBits: { candidate: number; baseline: number } + /** λ·L_model + L_data — the quantity the gate minimizes. */ + totalBits: { candidate: number; baseline: number } + /** candidate − baseline total. Negative = candidate compresses better. */ + deltaBits: number + /** Per-component deltas, for audit: did the win come from a smaller model, + * better outcomes, or both? */ + modelBitsDelta: number + dataBitsDelta: number +} + +export interface DescriptionLengthDecision { + promote: boolean + candidateId: string + baselineId: string + evidence: DescriptionLengthEvidence + reason: string + rejectionCode: DescriptionLengthRejectionCode | null +} + +export interface DescriptionLengthCandidate { + /** The model text whose size is L_model (a prompt, skill, profile, or + * symbolic model; concatenated if several files). */ + content: string + /** Runs whose per-task scores form L_data. */ + runs: RunRecord[] +} + +/** Score a single run, preferring the held-out score, then search, then the + * raw `score` metric. Returns undefined when the run carries no score. */ +function runScore(run: RunRecord): number | undefined { + const o = run.outcome + const s = o.holdoutScore ?? o.searchScore ?? o.raw?.score + return typeof s === 'number' && Number.isFinite(s) ? s : undefined +} + +/** Pairing key — the same scenario/experiment identity HeldOutGate pairs on. */ +function taskKey(run: RunRecord): string { + return run.scenarioId ?? run.experimentId +} + +/** Mean per-task score for a model: { taskKey -> mean(score over its runs) }. */ +function perTaskMeanScore(runs: RunRecord[]): Map { + const acc = new Map() + for (const run of runs) { + const s = runScore(run) + if (s === undefined) continue + const key = taskKey(run) + const cur = acc.get(key) ?? { sum: 0, n: 0 } + cur.sum += s + cur.n += 1 + acc.set(key, cur) + } + return new Map([...acc].map(([k, v]) => [k, v.sum / v.n])) +} + +/** Compressed-model bits — the model's description length L_model. gzip is a + * deterministic, dependency-free stand-in for Kolmogorov complexity; it + * rewards genuine compactness and penalizes boilerplate padding. */ +export function modelDescriptionBits(content: string): number { + return gzipSync(Buffer.from(content, 'utf8')).byteLength * 8 +} + +/** Residual surprise L_data(D|M) = −Σ_i log2(max(s_i, floor)) over the given + * per-task scores. Lower = the model more reliably succeeds. */ +export function dataDescriptionBits( + scoreByTask: Map, + keys: Iterable, + scoreFloor: number, +): number { + let bits = 0 + for (const key of keys) { + const s = scoreByTask.get(key) + if (s === undefined) continue + bits += -Math.log2(Math.max(s, scoreFloor)) + } + return bits +} + +export class DescriptionLengthGate { + private readonly baselineKey: string + private readonly lambda: number + private readonly marginBits: number + private readonly scoreFloor: number + private readonly minTasks: number + + constructor(config: DescriptionLengthConfig) { + if (!config.baselineKey) throw new Error('DescriptionLengthGate: baselineKey is required') + this.baselineKey = config.baselineKey + this.lambda = config.lambda ?? 1 + this.marginBits = config.marginBits ?? 0 + this.scoreFloor = config.scoreFloor ?? 2 ** -10 + this.minTasks = config.minTasks ?? 3 + if (!(this.lambda >= 0)) throw new Error('DescriptionLengthGate: lambda must be ≥ 0') + if (!(this.scoreFloor > 0 && this.scoreFloor < 1)) + throw new Error('DescriptionLengthGate: scoreFloor must be in (0,1)') + } + + /** Decide whether `candidate` should replace `baseline`. Both are scored on + * the shared task set (the enlarged evidence); the candidate promotes only + * if λ·L_model + L_data is strictly lower by at least `marginBits`. */ + evaluate( + candidate: DescriptionLengthCandidate, + baseline: DescriptionLengthCandidate, + ): DescriptionLengthDecision { + const candidateId = inferCandidateId(candidate.runs, this.baselineKey) + const candScores = perTaskMeanScore(candidate.runs) + const baseScores = perTaskMeanScore(baseline.runs) + // Enlarged evidence = tasks scored on BOTH sides (paired, like the paper's + // "both models refit on the same accumulated evidence"). + const shared = [...candScores.keys()].filter((k) => baseScores.has(k)) + + const modelBits = { + candidate: this.lambda * modelDescriptionBits(candidate.content), + baseline: this.lambda * modelDescriptionBits(baseline.content), + } + const dataBits = { + candidate: dataDescriptionBits(candScores, shared, this.scoreFloor), + baseline: dataDescriptionBits(baseScores, shared, this.scoreFloor), + } + const totalBits = { + candidate: modelBits.candidate + dataBits.candidate, + baseline: modelBits.baseline + dataBits.baseline, + } + const evidence: DescriptionLengthEvidence = { + tasks: shared.length, + modelBits, + dataBits, + totalBits, + deltaBits: totalBits.candidate - totalBits.baseline, + modelBitsDelta: modelBits.candidate - modelBits.baseline, + dataBitsDelta: dataBits.candidate - dataBits.baseline, + } + const base = { candidateId, baselineId: this.baselineKey, evidence } + const fmt = (n: number) => n.toFixed(1) + + if (shared.length < this.minTasks) { + return { + ...base, + promote: false, + reason: `few_tasks: ${shared.length} shared task(s) < min ${this.minTasks}`, + rejectionCode: 'few_tasks', + } + } + if (!(evidence.deltaBits < -this.marginBits)) { + // No net compression. Name the cause: did the model bloat eat a real + // data gain, or was there no data gain at all? + const code: DescriptionLengthRejectionCode = + evidence.dataBitsDelta < 0 ? 'model_bloat' : 'no_total_gain' + const why = + code === 'model_bloat' + ? `model grew ${fmt(evidence.modelBitsDelta)} bits, outpacing a ${fmt(-evidence.dataBitsDelta)}-bit data gain` + : `outcomes did not improve (data Δ=${fmt(evidence.dataBitsDelta)} bits)` + return { + ...base, + promote: false, + reason: `${code}: total Δ=${fmt(evidence.deltaBits)} bits does not clear the ${fmt(this.marginBits)}-bit margin — ${why}`, + rejectionCode: code, + } + } + return { + ...base, + promote: true, + reason: `promote: total Δ=${fmt(evidence.deltaBits)} bits (model Δ=${fmt(evidence.modelBitsDelta)}, data Δ=${fmt(evidence.dataBitsDelta)}) over ${shared.length} tasks`, + rejectionCode: null, + } + } +} + +function inferCandidateId(runs: RunRecord[], baselineKey: string): string { + for (const run of runs) + if (run.candidateId && run.candidateId !== baselineKey) return run.candidateId + return runs[0]?.candidateId ?? '(unknown candidate)' +} diff --git a/src/index.ts b/src/index.ts index 1f7f9ba..b08db12 100644 --- a/src/index.ts +++ b/src/index.ts @@ -998,6 +998,18 @@ export type { export { runCanaries } from './canary' // ── Concurrency + persistence + telemetry primitives for evolution loops ── export { Mutex } from './concurrency' +export type { + DescriptionLengthCandidate, + DescriptionLengthConfig, + DescriptionLengthDecision, + DescriptionLengthEvidence, + DescriptionLengthRejectionCode, +} from './description-length-gate' +export { + DescriptionLengthGate, + dataDescriptionBits, + modelDescriptionBits, +} from './description-length-gate' export type { DiscoveredPersona, DiscoverPersonasOptions, diff --git a/tests/description-length-gate.test.ts b/tests/description-length-gate.test.ts new file mode 100644 index 0000000..267fcc2 --- /dev/null +++ b/tests/description-length-gate.test.ts @@ -0,0 +1,139 @@ +import { createHash } from 'node:crypto' +import { describe, expect, it } from 'vitest' +import { + type DescriptionLengthCandidate, + DescriptionLengthGate, + dataDescriptionBits, + modelDescriptionBits, +} from '../src/description-length-gate' +import type { RunRecord } from '../src/run-record' + +function record(candidateId: string, task: string, score: number): RunRecord { + return { + runId: `${candidateId}-${task}`, + experimentId: `exp:${task}`, + candidateId, + seed: 0, + model: 'claude-sonnet-4-6@2025-04-15', + promptHash: 'p'.repeat(64), + configHash: 'c'.repeat(64), + commitSha: 'deadbeef', + wallMs: 1000, + costUsd: 0, + tokenUsage: { input: 100, output: 100 }, + outcome: { holdoutScore: score, raw: { score } }, + splitTag: 'holdout', + scenarioId: task, + } +} + +function candidate( + id: string, + content: string, + scores: Record, +): DescriptionLengthCandidate { + return { content, runs: Object.entries(scores).map(([task, s]) => record(id, task, s)) } +} + +const small = 'follow the protocol exactly; verify before finishing.' + +describe('description-length primitives', () => { + it('model bits grow with model size', () => { + expect(modelDescriptionBits(small.repeat(50))).toBeGreaterThan(modelDescriptionBits(small)) + }) + + it('data bits fall as scores rise; perfect scores cost ~0', () => { + const keys = ['a', 'b', 'c'] + const perfect = new Map(keys.map((k) => [k, 1])) + const mediocre = new Map(keys.map((k) => [k, 0.5])) + expect(dataDescriptionBits(perfect, keys, 2 ** -10)).toBeCloseTo(0, 6) + expect(dataDescriptionBits(mediocre, keys, 2 ** -10)).toBeCloseTo(3, 6) // each 0.5 task = 1 bit + }) + + it('a failed task is capped by the score floor, not infinite', () => { + const bits = dataDescriptionBits(new Map([['a', 0]]), ['a'], 2 ** -10) + expect(bits).toBeCloseTo(10, 6) // −log2(2^-10) + expect(Number.isFinite(bits)).toBe(true) + }) +}) + +describe('DescriptionLengthGate', () => { + const tasks = { a: 0.5, b: 0.5, c: 0.5, d: 0.5 } + + it('promotes a compact candidate that improves outcomes', () => { + const gate = new DescriptionLengthGate({ baselineKey: 'baseline' }) + const decision = gate.evaluate( + candidate('cand', small, { a: 0.9, b: 0.9, c: 0.9, d: 0.9 }), + candidate('baseline', small, tasks), + ) + expect(decision.promote).toBe(true) + expect(decision.evidence.dataBitsDelta).toBeLessThan(0) + expect(decision.rejectionCode).toBeNull() + }) + + it('rejects a model that improves outcomes but bloats — the MDL anti-overfit core', () => { + const gate = new DescriptionLengthGate({ baselineKey: 'baseline' }) + // Genuine bloat must be high-entropy — gzip correctly sees through + // low-entropy repetition. Incompressible hex stands in for memorization. + const entropy = Array.from({ length: 30 }, (_, i) => + createHash('sha256').update(`b${i}`).digest('hex'), + ).join('') + const bloated = `${small} ${entropy}` + const decision = gate.evaluate( + candidate('cand', bloated, { a: 0.55, b: 0.55, c: 0.55, d: 0.55 }), + candidate('baseline', small, tasks), + ) + expect(decision.promote).toBe(false) + expect(decision.rejectionCode).toBe('model_bloat') + expect(decision.evidence.dataBitsDelta).toBeLessThan(0) // outcomes DID improve + expect(decision.evidence.modelBitsDelta).toBeGreaterThan(0) // but the model grew more + }) + + it('rejects when outcomes do not improve', () => { + const gate = new DescriptionLengthGate({ baselineKey: 'baseline' }) + const decision = gate.evaluate( + candidate('cand', small, { a: 0.4, b: 0.4, c: 0.4, d: 0.4 }), + candidate('baseline', small, tasks), + ) + expect(decision.promote).toBe(false) + expect(decision.rejectionCode).toBe('no_total_gain') + }) + + const bigger = `${small} additionally, double-check each edge case named in the tests.` + const manyTasks = (score: number) => + Object.fromEntries(Array.from({ length: 350 }, (_, i) => [`t${i}`, score])) + + it('a bigger model wins once enough evidence justifies its bits', () => { + const gate = new DescriptionLengthGate({ baselineKey: 'baseline' }) + const decision = gate.evaluate( + candidate('cand', bigger, manyTasks(1)), + candidate('baseline', small, manyTasks(0.5)), + ) + expect(decision.evidence.modelBitsDelta).toBeGreaterThan(0) + expect(decision.evidence.dataBitsDelta).toBeLessThan(-decision.evidence.modelBitsDelta) + expect(decision.promote).toBe(true) + }) + + it('lambda scales the model penalty: doubling it rejects the same bigger model', () => { + const cand = candidate('cand', bigger, manyTasks(1)) + const base = candidate('baseline', small, manyTasks(0.5)) + expect( + new DescriptionLengthGate({ baselineKey: 'baseline', lambda: 1 }).evaluate(cand, base) + .promote, + ).toBe(true) + expect( + new DescriptionLengthGate({ baselineKey: 'baseline', lambda: 2 }).evaluate(cand, base) + .promote, + ).toBe(false) + }) + + it('refuses to promote below the task floor', () => { + const gate = new DescriptionLengthGate({ baselineKey: 'baseline', minTasks: 3 }) + const decision = gate.evaluate( + candidate('cand', small, { a: 0.9, b: 0.9 }), + candidate('baseline', small, { a: 0.5, b: 0.5 }), + ) + expect(decision.promote).toBe(false) + expect(decision.rejectionCode).toBe('few_tasks') + }) +})