From f1c9a9cdbc82e5749b5dfd69fd45c5fa9b9187c2 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 24 May 2026 16:19:24 +0000 Subject: [PATCH 1/3] pi-extension: port auto-push and multi-line truncation fix from CLK harness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two of the recent CLK harness PRs have a direct parallel in pi-extension: * push-on-commit + ahead counter (756723c). pi-extension already commits on every clk_checkpoint / clk_merge call but never pushes, so a remote-backed Pi workspace silently accumulates local commits. - src/git.ts: hasRemote, commitsAhead, pushBestEffort (best-effort, never throws; mirrors clk_harness/git_ops.py). - src/tools.ts: pushIfEnabled helper called after clk_checkpoint and clk_merge. Gated on CLK_GITHUB_PUSH_ON_COMMIT=true to match the Python TUI; surfaces an ↑N ahead count on push failure or when auto-push is disabled but commits exist. - src/index.ts: /clk-doctor now reports the ahead count and warns when local commits haven't reached origin. * multi-line objective truncation (24f379b). idea.slice(0, 60) was applied to the whole idea string without first isolating its first line, so a multi-line idea could leak a fragment of line 2 into the status bar. - src/index.ts: new firstLineShort helper, used at every ctx.ui.setStatus("clk-idea", …) site and in /clk-doctor. Tests: tests/git.test.ts covers no-remote/sync/unreachable cases for pushBestEffort and commitsAhead. tests/index.test.ts asserts firstLineShort returns single-line, capped output for multi-line input. 
--- pi-extension/src/git.ts | 64 +++++++++++++++++++++ pi-extension/src/index.ts | 30 ++++++++-- pi-extension/src/tools.ts | 40 +++++++++++++ pi-extension/tests/git.test.ts | 96 ++++++++++++++++++++++++++++++++ pi-extension/tests/index.test.ts | 16 +++++- 5 files changed, 241 insertions(+), 5 deletions(-) diff --git a/pi-extension/src/git.ts b/pi-extension/src/git.ts index a599dc8..d04d3d6 100644 --- a/pi-extension/src/git.ts +++ b/pi-extension/src/git.ts @@ -218,3 +218,67 @@ export async function saveAndSwitch( } await git(cwd, ["checkout", targetBranch], signal); } + +/** True when the repo has a remote with the given name. */ +export async function hasRemote( + cwd: string, + name = "origin", + signal?: AbortSignal, +): Promise { + try { + await git(cwd, ["remote", "get-url", name], signal); + return true; + } catch { + return false; + } +} + +/** + * Count of local commits not yet on the upstream tracked branch. Returns + * 0 on any failure (no remote, no upstream, detached HEAD, network down) + * so callers can use it directly as a UI counter. + */ +export async function commitsAhead( + cwd: string, + signal?: AbortSignal, +): Promise { + try { + await git(cwd, ["rev-parse", "--abbrev-ref", "--symbolic-full-name", "@{u}"], signal); + } catch { + return 0; + } + try { + const out = await git(cwd, ["rev-list", "--count", "@{u}..HEAD"], signal); + return Number.parseInt(out, 10) || 0; + } catch { + return 0; + } +} + +/** + * Best-effort `git push` — never throws. Returns `{ pushed: true }` on + * success, otherwise `{ pushed: false, reason }` with stderr-derived + * detail so the caller can surface a hint without writing its own + * error-handling. 
+ */ +export async function pushBestEffort( + cwd: string, + remote = "origin", + branch?: string, + signal?: AbortSignal, +): Promise<{ pushed: boolean; reason?: string }> { + if (!(await hasRemote(cwd, remote, signal))) { + return { pushed: false, reason: "no remote configured" }; + } + const args = ["push", remote, branch ?? "HEAD"]; + try { + await git(cwd, args, signal); + return { pushed: true }; + } catch (err) { + const raw = (err as { stderr?: string }).stderr; + const reason = (typeof raw === "string" && raw.trim()) + ? raw.trim().split("\n").slice(-1)[0]?.slice(0, 200) + : (err as Error).message?.slice(0, 200); + return { pushed: false, reason: reason || "unknown error" }; + } +} diff --git a/pi-extension/src/index.ts b/pi-extension/src/index.ts index 5e3afd9..57abba0 100644 --- a/pi-extension/src/index.ts +++ b/pi-extension/src/index.ts @@ -14,7 +14,7 @@ import { appendProgress, isDone, } from "./state.js"; -import { ensureRepo } from "./git.js"; +import { ensureRepo, commitsAhead, hasRemote } from "./git.js"; import { clkChiefPrimer } from "./prompts.js"; import { registerClkTools } from "./tools.js"; import { registerSubagentTool, tmuxAvailable } from "./subagent.js"; @@ -23,6 +23,17 @@ import { classifyError, recoveryHint, withRetry } from "./errors.js"; const execFileAsync = promisify(execFile); +/** + * Return the first non-empty line of `s`, trimmed and truncated to `max` + * characters. Used for status-bar labels where a multi-line idea (or + * objective) would otherwise leak a fragment of line 2 into the status + * display — the same bug the Python TUI fixed in commit 24f379b. + */ +export function firstLineShort(s: string, max = 60): string { + const line = s.split("\n").find((l) => l.trim()); + return (line ?? 
s).trim().slice(0, max); +} + export default async function (pi: ExtensionAPI): Promise { installAbortBridges(pi); registerClkTools(pi); @@ -49,7 +60,7 @@ export default async function (pi: ExtensionAPI): Promise { "info", ); } else { - ctx.ui.setStatus("clk-idea", `idea: ${s.idea.slice(0, 60)}`); + ctx.ui.setStatus("clk-idea", `idea: ${firstLineShort(s.idea)}`); } if (s.roster) { ctx.ui.setStatus( @@ -200,7 +211,18 @@ export default async function (pi: ExtensionAPI): Promise { ); const idea = getState().idea; - findings.push(idea ? ` ✓ ok idea: ${idea.slice(0, 60)}` : " - info no idea captured yet"); + findings.push(idea ? ` ✓ ok idea: ${firstLineShort(idea)}` : " - info no idea captured yet"); + + // Unpushed-commits check — mirrors the Python TUI's ahead counter + // so the user knows when local checkpoints haven't reached origin. + if (repoOk && await hasRemote(ctx.cwd)) { + const ahead = await commitsAhead(ctx.cwd); + if (ahead > 0) { + findings.push(` ! warn ${ahead} commit(s) ahead of origin (auto-push only fires when CLK_GITHUB_PUSH_ON_COMMIT=true)`); + } else { + findings.push(" ✓ ok in sync with origin"); + } + } ctx.ui.notify(["CLK doctor:", ...findings].join("\n"), "info"); }, @@ -236,7 +258,7 @@ export default async function (pi: ExtensionAPI): Promise { { kind: "note", message: `idea captured: ${idea}` }, pi, ); - ctx.ui.setStatus("clk-idea", `idea: ${idea.slice(0, 60)}`); + ctx.ui.setStatus("clk-idea", `idea: ${firstLineShort(idea)}`); ctx.ui.setStatus("clk-run", "active"); ctx.ui.notify( "CLK run started. The chief is taking over. 
Esc cancels the current turn; /clk-abort ends the run.", diff --git a/pi-extension/src/tools.ts b/pi-extension/src/tools.ts index d2347c2..0bc29f6 100644 --- a/pi-extension/src/tools.ts +++ b/pi-extension/src/tools.ts @@ -11,10 +11,48 @@ import { checkoutBranch, mergeBranch, saveAndSwitch, + commitsAhead, + hasRemote, + pushBestEffort, } from "./git.js"; import { activeSignal, mergeSignals, endRun } from "./abort.js"; import { classifyError, looksRedacted, recoveryHint, withRetry } from "./errors.js"; +/** + * Push the latest commit to `origin` when the user opted in via + * `CLK_GITHUB_PUSH_ON_COMMIT=true` (same env var as the Python TUI). On + * success, updates the clk-git status to "synced". On failure (or when + * push isn't enabled but a remote exists), surfaces an `↑N` ahead count + * so the user knows how many local checkpoints haven't reached origin. + * Best-effort throughout — never throws. + */ +async function pushIfEnabled( + cwd: string, + setStatus: (key: string, value: string) => void, + signal?: AbortSignal, +): Promise { + try { + if (!(await hasRemote(cwd, "origin", signal))) return; + const pushOn = (process.env.CLK_GITHUB_PUSH_ON_COMMIT ?? "false").toLowerCase() === "true"; + if (pushOn) { + const res = await pushBestEffort(cwd, "origin", undefined, signal); + if (res.pushed) { + setStatus("clk-git", "synced"); + return; + } + const ahead = await commitsAhead(cwd, signal); + setStatus("clk-git", `↑${ahead} (push failed: ${res.reason ?? "unknown"})`); + return; + } + const ahead = await commitsAhead(cwd, signal); + if (ahead > 0) { + setStatus("clk-git", `↑${ahead} unpushed (set CLK_GITHUB_PUSH_ON_COMMIT=true to auto-push)`); + } + } catch { + /* best-effort — never block the tool result on push bookkeeping. 
*/ + } +} + export function registerClkTools(pi: ExtensionAPI): void { pi.registerTool({ name: "clk_cast", @@ -161,6 +199,7 @@ export function registerClkTools(pi: ExtensionAPI): void { pi, ); ctx.ui.setStatus("clk-head", `HEAD: ${sha.slice(0, 8)}`); + await pushIfEnabled(ctx.cwd, ctx.ui.setStatus.bind(ctx.ui), sig); } return { content: [ @@ -354,6 +393,7 @@ export function registerClkTools(pi: ExtensionAPI): void { ); ctx.ui.setStatus("clk-branch", `merged → ${home}`); if (mergeHead) ctx.ui.setStatus("clk-head", `HEAD: ${mergeHead.slice(0, 8)}`); + await pushIfEnabled(ctx.cwd, ctx.ui.setStatus.bind(ctx.ui), sig); return { content: [{ type: "text", text: `merged ${featureBranch} into ${home}` }], details: { featureBranch, home, mergeHead }, diff --git a/pi-extension/tests/git.test.ts b/pi-extension/tests/git.test.ts index 6b05c99..5f39353 100644 --- a/pi-extension/tests/git.test.ts +++ b/pi-extension/tests/git.test.ts @@ -21,6 +21,9 @@ import { checkoutBranch, mergeBranch, saveAndSwitch, + hasRemote, + commitsAhead, + pushBestEffort, } from "../src/git.ts"; const execFileAsync = promisify(execFile); @@ -166,3 +169,96 @@ describe("branching", () => { assert.equal(await head(dir), baseSha); }); }); + +describe("remote / push / ahead", () => { + test("hasRemote is false on a fresh repo with no remote", async () => { + const dir = await mkdtemp(join(tmpdir(), "clk-remote-")); + try { + await ensureRepo(dir); + assert.equal(await hasRemote(dir), false); + // commitsAhead returns 0 when there's no upstream, never throws. 
+ assert.equal(await commitsAhead(dir), 0); + } finally { + await rm(dir, { recursive: true, force: true }); + } + }); + + test("hasRemote is true after `git remote add`", async () => { + const dir = await mkdtemp(join(tmpdir(), "clk-remote2-")); + try { + await ensureRepo(dir); + await execFileAsync( + "git", ["remote", "add", "origin", "/tmp/nonexistent-bare.git"], { cwd: dir }, + ); + assert.equal(await hasRemote(dir), true); + } finally { + await rm(dir, { recursive: true, force: true }); + } + }); + + test("commitsAhead counts local commits not on upstream; pushBestEffort syncs", async () => { + const bare = await mkdtemp(join(tmpdir(), "clk-bare-")); + const work = await mkdtemp(join(tmpdir(), "clk-work-")); + try { + await execFileAsync("git", ["init", "--bare", "-q"], { cwd: bare }); + await ensureRepo(work); + await gitConfig(work, "user.name", "test"); + await gitConfig(work, "user.email", "test@clk.invalid"); + await disableSigning(work); + await writeFile(join(work, "seed.txt"), "seed"); + await checkpoint(work, "[clk] seed"); + // Wire the bare as origin and set upstream via the first push. + await execFileAsync("git", ["remote", "add", "origin", bare], { cwd: work }); + const branch = await currentBranch(work); + await execFileAsync("git", ["push", "-u", "origin", branch], { cwd: work }); + assert.equal(await commitsAhead(work), 0); + + // Make a new local commit; ahead becomes 1. + await writeFile(join(work, "next.txt"), "more"); + await checkpoint(work, "[clk] next"); + assert.equal(await commitsAhead(work), 1); + + // pushBestEffort should sync; ahead returns to 0. 
+ const res = await pushBestEffort(work, "origin"); + assert.equal(res.pushed, true, `expected push to succeed, got ${JSON.stringify(res)}`); + assert.equal(await commitsAhead(work), 0); + } finally { + await rm(bare, { recursive: true, force: true }); + await rm(work, { recursive: true, force: true }); + } + }); + + test("pushBestEffort returns {pushed:false,reason} when the remote is unreachable", async () => { + const dir = await mkdtemp(join(tmpdir(), "clk-unreach-")); + try { + await ensureRepo(dir); + await gitConfig(dir, "user.name", "test"); + await gitConfig(dir, "user.email", "test@clk.invalid"); + await disableSigning(dir); + await writeFile(join(dir, "x.txt"), "x"); + await checkpoint(dir, "[clk] x"); + // Bogus path — push must fail, but pushBestEffort must NOT throw. + await execFileAsync( + "git", ["remote", "add", "origin", "/tmp/definitely-does-not-exist-bare.git"], + { cwd: dir }, + ); + const res = await pushBestEffort(dir, "origin"); + assert.equal(res.pushed, false); + assert.ok(res.reason && res.reason.length > 0, `expected a reason, got ${JSON.stringify(res)}`); + } finally { + await rm(dir, { recursive: true, force: true }); + } + }); + + test("pushBestEffort returns {pushed:false} cleanly when there is no remote", async () => { + const dir = await mkdtemp(join(tmpdir(), "clk-noremote-")); + try { + await ensureRepo(dir); + const res = await pushBestEffort(dir, "origin"); + assert.equal(res.pushed, false); + assert.match(res.reason ?? 
"", /no remote/i); + } finally { + await rm(dir, { recursive: true, force: true }); + } + }); +}); diff --git a/pi-extension/tests/index.test.ts b/pi-extension/tests/index.test.ts index 605a63a..c17a6cf 100644 --- a/pi-extension/tests/index.test.ts +++ b/pi-extension/tests/index.test.ts @@ -14,7 +14,7 @@ import { mkdtemp, rm } from "node:fs/promises"; import { tmpdir } from "node:os"; import { join } from "node:path"; -import clkExtension from "../src/index.ts"; +import clkExtension, { firstLineShort } from "../src/index.ts"; // --------------------------------------------------------------------------- // Fake pi.ExtensionAPI -- just enough surface for the extension to register @@ -106,6 +106,20 @@ describe("clkExtension default export", () => { } }); + test("firstLineShort returns only the first non-empty line, trimmed and capped", () => { + // Single line — returned verbatim up to the cap. + assert.equal(firstLineShort("hello world", 60), "hello world"); + // Multi-line — the second line must never leak into the status string. + assert.equal(firstLineShort("refactor X\n\nbecause Y", 60), "refactor X"); + // Leading blank lines are skipped so the first *content* line wins. + assert.equal(firstLineShort("\n\nactual idea\nmore", 60), "actual idea"); + // Long single line is truncated to max chars; no newline appears. 
+ const long = "a".repeat(120); + const out = firstLineShort(long, 60); + assert.equal(out.length, 60); + assert.equal(out.includes("\n"), false); + }); + test("/clk command rejects empty idea with a warning", async () => { const { pi, commands } = makeFakePi(); await clkExtension(pi as any); From 05af6cd1017d780b1b063c07100bd49e25ecc9b2 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 24 May 2026 16:28:23 +0000 Subject: [PATCH 2/3] pi-extension: port auto-consensus, quality re-dispatch, autoresearch, Ralph MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ports the Python harness's orchestration loops into the TypeScript extension so the chief can drive real code-enforced fan-out instead of having to fan-out by emitting parallel clk_subagent calls and hoping it followed the prompt. src/quality.ts (new) Port of clk_harness/orchestration/response_quality.py. Pure regex / string scorer — no I/O, no provider calls. Detects empty bodies, refusal phrases, malformed ACTION / POST blocks, missing declared POST PRODUCES keys, low CONFIDENCE: values, and NEEDS_REVIEW: true. Exposes scoreResponse, repairHint, isRecoverable, summarise. src/consensus.ts (new) Two primitives, both with an injectable spawn function so tests can drive them without tmux / pi installed: * dispatchWithQuality — wraps a single spawnSubagent in the quality re-dispatch loop. Re-runs with a repair-preamble preface on every recoverable failure up to maxRetries. * runConsensus — fan-out N parallel tmux samples for the same task, score each, return all + the winner. Pool runner caps concurrent in-flight sessions via maxParallel. src/subagent.ts Exposes spawnSubagent + SpawnOptions so consensus.ts can call them. Behaviour unchanged. src/tools.ts (+428 LOC) Four new tools registered alongside the existing roster: * clk_subagent_quality — one subagent + quality re-rolls. * clk_consensus — N samples, scored, winner returned. 
* clk_autoresearch — researcher + critic alternation (iterations are recorded in progress.md). * clk_ralph — branch + consensus fan-out in one call; the chief then calls clk_merge or clk_revert based on validation. Each tool surfaces a structured details payload so the chief sees scores, attempts, and flags rather than just the winning text. src/prompts.ts Updated chief primer to direct the chief through the new tools (Dispatch tool quick reference, restated rules 3, 4, 5A). The old "emit 3-5 clk_subagent calls in the same message" guidance is replaced by "call clk_consensus" so fan-out is enforced in code, not by chief compliance. src/index.ts /clk-help lists every orchestration tool and notes the CLK_GITHUB_PUSH_ON_COMMIT auto-push behaviour that landed in the prior commit. Tests: 24 new tests across quality.test.ts (happy paths, every failure mode, repairHint / isRecoverable / summarise) and consensus.test.ts (injected spawn covers ok / retry / max-retries / non-recoverable refusal / fan-out winner picking / sample clamping / error capture / maxParallel concurrency). index.test.ts and prompts.test.ts updated to assert the new tools are registered and named in the chief primer. All 94 tests pass, typecheck clean. 
--- pi-extension/package.json | 2 +- pi-extension/src/consensus.ts | 218 ++++++++++++++ pi-extension/src/index.ts | 13 +- pi-extension/src/prompts.ts | 136 ++++++--- pi-extension/src/quality.ts | 250 ++++++++++++++++ pi-extension/src/subagent.ts | 4 +- pi-extension/src/tools.ts | 428 +++++++++++++++++++++++++++ pi-extension/tests/consensus.test.ts | 213 +++++++++++++ pi-extension/tests/index.test.ts | 24 +- pi-extension/tests/prompts.test.ts | 11 +- pi-extension/tests/quality.test.ts | 126 ++++++++ 11 files changed, 1379 insertions(+), 46 deletions(-) create mode 100644 pi-extension/src/consensus.ts create mode 100644 pi-extension/src/quality.ts create mode 100644 pi-extension/tests/consensus.test.ts create mode 100644 pi-extension/tests/quality.test.ts diff --git a/pi-extension/package.json b/pi-extension/package.json index b897bea..17c5b92 100644 --- a/pi-extension/package.json +++ b/pi-extension/package.json @@ -10,7 +10,7 @@ }, "scripts": { "typecheck": "tsc --noEmit", - "test:unit": "tsx --test tests/errors.test.ts tests/prompts.test.ts tests/state.test.ts tests/git.test.ts tests/index.test.ts tests/runtime_smoke.test.ts tests/safety_nets.test.ts", + "test:unit": "tsx --test tests/errors.test.ts tests/prompts.test.ts tests/state.test.ts tests/git.test.ts tests/index.test.ts tests/runtime_smoke.test.ts tests/safety_nets.test.ts tests/quality.test.ts tests/consensus.test.ts", "test": "npm run test:unit", "test:strict": "npm run typecheck && npm run test:unit" }, diff --git a/pi-extension/src/consensus.ts b/pi-extension/src/consensus.ts new file mode 100644 index 0000000..1827a33 --- /dev/null +++ b/pi-extension/src/consensus.ts @@ -0,0 +1,218 @@ +/** + * Stochastic consensus + quality re-dispatch for clk_subagent. + * + * Two related primitives in one module: + * + * * dispatchWithQuality — wraps a single spawnSubagent call with the + * quality re-dispatch loop (port of agent.py + * _dispatch_with_quality_loop). 
Scores the output via quality.ts; + * when the verdict is recoverable, re-runs with a repair preamble + * up to `maxRetries` extra attempts. + * + * * runConsensus — fan-out N parallel tmux subagent samples for the + * same task, score each via quality.ts, return all samples plus the + * best (highest score, ok=true preferred). Port of + * agent.py _dispatch_auto_consensus, minus the chief-coalescing + * pass (the caller can choose to feed all samples back to the chief + * if it wants a synthesised answer; the typical case is "pick the + * winner and continue"). + * + * Both helpers are exposed as new clk tools in tools.ts so the chief + * can dispatch through them instead of raw clk_subagent. The chief + * prompt nudges it that way for any non-trivial work. + */ + +import { spawnSubagent as defaultSpawnSubagent, type SpawnOptions } from "./subagent.js"; +import { + scoreResponse, + repairHint, + isRecoverable, + summarise, + type ResponseQuality, + type ScoreOpts, +} from "./quality.js"; + +/** + * The signature of the function that actually spawns a subagent. + * Defaults to the real tmux-based implementation in subagent.ts; + * the tests inject a synchronous in-memory stub so they can run + * without tmux / pi available. + */ +export type SpawnFn = (opts: SpawnOptions) => Promise<{ output: string; sessionId: string }>; + +export interface QualityDispatchOptions extends SpawnOptions { + /** + * Extra spawn attempts after the initial one. Default 1 (so up to + * two total dispatches per call). Set to 0 to disable the loop. + */ + maxRetries?: number; + /** Scoring options forwarded to quality.scoreResponse. */ + scoreOpts?: ScoreOpts; + /** Called with a short status line on each retry. Optional. */ + onRetry?: (attempt: number, quality: ResponseQuality) => void; + /** + * Injectable spawn function — defaults to the real tmux-based + * spawnSubagent. Tests pass a stub. 
+ */ + spawn?: SpawnFn; +} + +export interface QualityDispatchResult { + output: string; + sessionId: string; + quality: ResponseQuality; + attempts: number; +} + +/** + * Dispatch one subagent, score the output, re-dispatch with a repair + * preamble on recoverable failures. Returns the *last* run (which is + * either the first ok run, or the final attempt's run when retries + * ran out — callers inspect `quality.ok` to decide). + */ +export async function dispatchWithQuality( + opts: QualityDispatchOptions, +): Promise { + const maxRetries = Math.max(0, opts.maxRetries ?? 1); + const scoreOpts = opts.scoreOpts ?? {}; + const spawn = opts.spawn ?? defaultSpawnSubagent; + const baseTask = opts.task; + let currentTask = baseTask; + let attempt = 0; + let lastQuality: ResponseQuality = scoreResponse(""); + let lastOutput = ""; + let lastSessionId = ""; + while (true) { + attempt += 1; + const { output, sessionId } = await spawn({ + ...opts, + task: currentTask, + }); + lastOutput = output; + lastSessionId = sessionId; + lastQuality = scoreResponse(output, scoreOpts); + if (lastQuality.ok || !isRecoverable(lastQuality) || attempt > maxRetries) { + return { output, sessionId, quality: lastQuality, attempts: attempt }; + } + opts.onRetry?.(attempt, lastQuality); + currentTask = repairHint(lastQuality) + "\n\nOriginal task:\n" + baseTask; + } + // Unreachable. + return { + output: lastOutput, + sessionId: lastSessionId, + quality: lastQuality, + attempts: attempt, + }; +} + +export interface ConsensusSample { + index: number; + agent: string; + output: string; + sessionId: string; + quality: ResponseQuality; + /** Set when spawnSubagent threw before producing output. */ + error?: string; +} + +export interface ConsensusOptions extends Omit { + /** Number of parallel samples. Clamped to 1..6. Default 3. */ + samples?: number; + /** Max concurrent in-flight tmux sessions. Clamped to 1..samples. Default min(4, samples). 
*/ + maxParallel?: number; + scoreOpts?: ScoreOpts; + /** + * Called with each sample's progress update. The fan-out wraps the + * tmux poll messages so the caller can stream them. + */ + onSample?: (index: number, message: string) => void; + /** Injectable spawn function — tests pass a stub. */ + spawn?: SpawnFn; +} + +export interface ConsensusResult { + best: ConsensusSample; + all: ConsensusSample[]; + /** Short human-readable winning rationale. */ + reason: string; +} + +function pickBest(samples: ConsensusSample[]): { winner: ConsensusSample; reason: string } { + if (samples.length === 0) { + throw new Error("runConsensus: no samples to pick from"); + } + // Prefer samples that came back with output, then highest quality + // score, tie-break on shorter output (less filler). + const sorted = [...samples].sort((a, b) => { + const aHas = a.error ? 0 : 1; + const bHas = b.error ? 0 : 1; + if (aHas !== bHas) return bHas - aHas; + if (a.quality.score !== b.quality.score) return b.quality.score - a.quality.score; + return a.output.length - b.output.length; + }); + const winner = sorted[0]!; + let reason = `sample #${winner.index} won: ${summarise(winner.quality)}`; + if (samples.length > 1) { + const scores = samples.map((s) => `#${s.index}=${s.quality.score.toFixed(2)}`).join(" "); + reason += ` (all: ${scores})`; + } + return { winner, reason }; +} + +/** + * Spawn N parallel subagent samples for the same task; score each; + * return them all plus the winner. Never throws — failed samples carry + * their error in `sample.error` and contribute a 0-score quality. + */ +export async function runConsensus(opts: ConsensusOptions): Promise { + const samples = Math.max(1, Math.min(6, Math.floor(opts.samples ?? 3))); + const maxParallel = Math.max(1, Math.min(samples, Math.floor(opts.maxParallel ?? Math.min(4, samples)))); + const scoreOpts = opts.scoreOpts ?? {}; + const spawn = opts.spawn ?? 
defaultSpawnSubagent; + + // Simple semaphore-style runner: launch up to `maxParallel` at a time. + const indices = Array.from({ length: samples }, (_, i) => i + 1); + const collected: ConsensusSample[] = []; + + const runOne = async (idx: number): Promise => { + try { + const { output, sessionId } = await spawn({ + agent: opts.agent, + task: opts.task, + preferredModel: opts.preferredModel, + cwd: opts.cwd, + signal: opts.signal, + onUpdate: (text) => opts.onSample?.(idx, text), + }); + const quality = scoreResponse(output, scoreOpts); + return { index: idx, agent: opts.agent, output, sessionId, quality }; + } catch (err) { + return { + index: idx, + agent: opts.agent, + output: "", + sessionId: "", + quality: scoreResponse(""), + error: (err as Error).message, + }; + } + }; + + // Pool: keep `maxParallel` in flight, drain as they complete. + let next = 0; + async function worker(): Promise { + while (next < indices.length) { + const myIdx = indices[next++]!; + const result = await runOne(myIdx); + collected.push(result); + } + } + const workers = Array.from({ length: maxParallel }, () => worker()); + await Promise.all(workers); + + // Stable order by sample index. + collected.sort((a, b) => a.index - b.index); + const { winner, reason } = pickBest(collected); + return { best: winner, all: collected, reason }; +} diff --git a/pi-extension/src/index.ts b/pi-extension/src/index.ts index 57abba0..c2a2e27 100644 --- a/pi-extension/src/index.ts +++ b/pi-extension/src/index.ts @@ -85,15 +85,26 @@ export default async function (pi: ExtensionAPI): Promise { " consensus, Ralph refinement, and autoresearch.", " /clk-abort End the current run. 
Preserves state for resume.", " /clk-help Show this list.", - " /clk-doctor Health-check tmux + git + workspace state.", + " /clk-doctor Health-check tmux + git + remote + workspace state.", " /clk-undo Preview the last CLK commit; `/clk-undo confirm`", " creates a new revert commit on top of it.", "", + "Orchestration tools the chief uses (you don't call these directly):", + " clk_cast / clk_subagent — roster + raw dispatch.", + " clk_subagent_quality — single dispatch + quality re-roll.", + " clk_consensus — N parallel samples, scored, winner returned.", + " clk_autoresearch — researcher + critic alternation.", + " clk_ralph — branch + consensus fan-out in one call.", + " clk_branch / clk_merge / clk_revert / clk_checkpoint — git plumbing.", + " clk_done — completion signal.", + "", "Safety nets active in this workspace:", " - Hardened .gitignore blocks .env / .env.bak / *.pem / id_rsa.", " - .git/hooks/pre-push aborts pushes containing API-key patterns.", " - .clk/state/*.{json,md} are written atomically with .bak rotation.", " - Each completed iteration is checkpointed with `git commit`.", + " - With CLK_GITHUB_PUSH_ON_COMMIT=true, every checkpoint auto-pushes", + " to origin (and falls back to an ↑N ahead counter when it can't).", "", "Re-read this anytime with /clk-help. If something looks stuck, the", "agent_end hook will report it; `/clk-doctor` triages provider and", diff --git a/pi-extension/src/prompts.ts b/pi-extension/src/prompts.ts index d7b45e7..fc615e4 100644 --- a/pi-extension/src/prompts.ts +++ b/pi-extension/src/prompts.ts @@ -10,10 +10,50 @@ export function clkChiefPrimer(idea: string): string { return ` You are the **CLK chief**, the orchestrating agent inside the Pi terminal harness. Your job is to take the captured idea, dynamically design a team of specialists, -dispatch them via the \`clk_subagent\` tool, and drive -the project to completion through repeated agentic cycles. 
Every meaningful -change is committed to git via the CLK extension's \`clk_checkpoint\` tool, so -no good work is ever lost. +dispatch them via the CLK tools below, and drive the project to completion +through repeated agentic cycles. Every meaningful change is committed to git via +the CLK extension's \`clk_checkpoint\` tool, so no good work is ever lost. + +## Dispatch tool quick reference + +You have four dispatch tools — pick the one that matches the situation: + +* \`clk_subagent({ agent, task, preferredModel? })\` — one subagent, no + quality gate. Use only for cheap, low-risk work where re-rolling is + pointless (e.g. simple file reads, status pings). +* \`clk_subagent_quality({ agent, task, maxRetries?, preferredModel? })\` — + one subagent **scored by the harness's quality detector**, with up to + \`maxRetries\` automatic repair re-rolls. Default everywhere a single + worker is enough but you want bad output caught before it propagates. +* \`clk_consensus({ agent, task, samples?, preferredModel? })\` — fan-out N + parallel samples (default 3, max 6), each scored, returns the winner + plus all candidates. Use **liberally** for any decision that benefits + from diverse independent attempts: architecture, design choices, + ambiguous requirements, validation verdicts, security/perf reviews, + reviewer/oracle synthesis steps. +* \`clk_autoresearch({ question, iterations?, preferredModel? })\` — + bounded research loop (default 2 iterations) that alternates a + \`researcher\` and a \`critic\` subagent and records each finding. Use + before non-trivial implementation work whenever the optimal approach + is unclear. + +For an entire Ralph iteration in one tool call: + +* \`clk_ralph({ iterationName, agent, task, samples?, preferredModel? })\` — + creates a fresh \`ralph/\` branch, dispatches a consensus + fan-out, and returns the winning output. You then EITHER call + \`clk_merge\` (accept) or \`clk_revert\` (reject) based on validation. 
+ Prefer this over manual \`clk_branch\` + dispatch + commit when running + iterative refinement — the branch creation and fan-out happen in one + step and can't be skipped. + +The harness scores every \`clk_consensus\`, \`clk_subagent_quality\`, and +\`clk_autoresearch\` output against the same rule set used by the Python +CLK harness (empty / refusal / malformed-block / low-confidence / missing +declared outputs). Recoverable failures auto-retry with a repair preamble +so your worker fixes the specific problems rather than re-rolling at +random. **Use the quality-gated tools as your default**; reserve raw +\`clk_subagent\` for genuinely throwaway work. ## Captured idea @@ -44,24 +84,36 @@ ${idea} decision-making mechanism for every meaningful choice: architecture, implementation approach, API contract, data model, security boundary, ambiguous requirement, risky refactor, and any time two or more - reasonable paths exist. Emit **3–5 \`clk_subagent\` tool calls in the same - assistant message**, each posing the question with a different framing, - prior, or role. Pi runs sibling tool calls concurrently by default, so - they fan out in parallel. Then in your next turn, emit ONE more - \`clk_subagent\` call to a judge (\`oracle\` or \`reviewer\`) that reads all - the candidates and picks or synthesizes the answer. Record the winner - with \`clk_progress({ kind: "consensus", message: "..." })\`. + reasonable paths exist. + + The harness ships a code-enforced fan-out tool — use it directly: + + clk_consensus({ + agent: "designer", + samples: 3, // or 5 for high-stakes decisions + task: "[Role: ...]\\n[Mission: ...]\\n\\nQuestion: ..." + }) + + \`clk_consensus\` spawns the N subagents in parallel via tmux, scores + each output through the harness's quality detector, and returns the + highest-scoring winner along with every candidate's score so you can + see the spread. 
If you need a synthesised answer rather than the + winner, follow with one \`clk_consensus\` call to an \`oracle\` or + \`reviewer\` whose task quotes all candidates and asks for a merged + verdict. Record the outcome with + \`clk_progress({ kind: "consensus", message: "..." })\`. **Encourage stochastic consensus at the start of every Ralph iteration**, not only when uncertainty is obvious. Even a quick 3-way fan-out on "what is the highest-value next improvement?" yields better choices than a - single-agent guess. + single-agent guess. The \`clk_ralph\` tool below already includes a + consensus fan-out by default. 4. **Refinement: Ralph loop — iterate until done.** Once an MVP exists and tests pass, enter a refinement loop and **keep looping without pausing for user input** until \`clk_done\` is called. Do not stop between iterations — immediately pick the next improvement and start the next - cycle. Each iteration follows this exact branch-based protocol: + cycle. Prefer the one-call \`clk_ralph\` form for each iteration: a. Pick ONE improvement (lowest-risk, highest-value). Classify it: - **Measurable** (has a numeric outcome): run rule 5B @@ -70,27 +122,33 @@ ${idea} authorised changes or the completion criteria are met. - **Qualitative** (design, architecture, unknown approach): run rule 5A first to resolve the open question, then - proceed with steps (b)–(h) below. - b. Create a feature branch: \`clk_branch({ name: - "ralph/iter-N-short-description" })\`. All work for this - iteration happens on that branch. - c. Dispatch a worker via \`clk_subagent\` to implement the improvement. - d. Call \`clk_checkpoint({ message: "ralph: " })\` - to commit the work to the feature branch. - e. Run the project's validation command (\`pytest -q\`, \`npm test\`, + proceed with steps (b)–(g) below. + b. 
\`clk_ralph({ iterationName: "iter-N-short-description", + agent: "engineer", task: "<full task, including persona>", samples: 3 })\` + The tool creates a fresh \`ralph/iter-N-short-description\` + branch, fans out 3 parallel subagent samples, scores them, + and returns the winning output. You read the winner and decide. + c. Call \`clk_checkpoint({ message: "ralph: <what changed>" })\` + to commit any additional changes you made on top of the winner. + d. Run the project's validation command (\`pytest -q\`, \`npm test\`, etc.) via the built-in \`bash\` tool. - f. **If validation passes:** call \`clk_merge({ message: + e. **If validation passes:** call \`clk_merge({ message: "ralph win: <what improved>" })\`. This commits any remaining changes, merges the feature branch into the home branch, and - returns you to the home branch. The accepted work is now on the home branch. + returns you to the home branch. Record with \`clk_progress({ kind: "ralph", message: "win: ..." })\`. - g. **If validation fails:** call \`clk_revert({ reason: "<why it + f. **If validation fails:** call \`clk_revert({ reason: "<why it failed>" })\`. This commits the rejected work to the feature branch (preserving it for review), then switches back to the home branch without merging. The rejected branch is never deleted. Record with \`clk_progress({ kind: "ralph", message: "rejected: ..." })\`. - h. Loop back to step (a) immediately for the next iteration. + g. Loop back to step (a) immediately for the next iteration. + + Manual branch / dispatch / commit is still allowed via \`clk_branch\` + + \`clk_subagent_quality\` + \`clk_checkpoint\` if you need more control; + the \`clk_ralph\` form is just the recommended default because it + can't accidentally skip the branch + fan-out steps. After every ~10 consecutive iterations pause to re-evaluate direction with consensus (rule 3). **Resume the loop immediately after @@ -108,19 +166,21 @@ ${idea} ### 5A. Qualitative autoresearch (open questions, design trade-offs, unknown library behaviour, ambiguous requirements) - Use Ralph-style parallel dispatch + stochastic consensus (rule 3): - a. 
State the open question precisely. - b. Fan out **3–5 \`clk_subagent\` calls in the same message**, each - exploring the question from a different angle — different - framing, different role, different prior. Use \`researcher\` - for external evidence, \`scout\` for code recon, \`worker\` for - a throwaway spike. They run concurrently. - c. In the next turn emit ONE \`oracle\` or \`reviewer\` call that - synthesizes all results and produces a decision. - d. Record with \`clk_progress({ kind: "autoresearch", message: - "qualitative: <question → decision>" })\`. - e. Apply immediately to the next Ralph iteration or architectural - decision. + Use the dedicated tool — it runs the researcher + critic alternation + in code, scores every output, and records each iteration on the + progress log: + + clk_autoresearch({ + question: "<the open question, stated precisely>", + iterations: 2, // 1..5; bump to 3 for high-stakes + }) + + Then in your next turn either act on the consolidated findings (apply + to the next Ralph iteration / architectural decision) or, if the + answer is still uncertain, fan out a \`clk_consensus\` synthesis pass + with the autoresearch findings quoted into the task. Record with + \`clk_progress({ kind: "autoresearch", message: "qualitative: <question> + → <decision>" })\`. ### 5B. Quantitative autoresearch (Karpathy autoresearch pattern) diff --git a/pi-extension/src/quality.ts b/pi-extension/src/quality.ts new file mode 100644 index 0000000..df9da69 --- /dev/null +++ b/pi-extension/src/quality.ts @@ -0,0 +1,250 @@ +/** + * Response-quality scorer — TypeScript port of + * clk_harness/orchestration/response_quality.py. + * + * Used by clk_consensus to pick the best of N stochastic samples and by + * the clk_subagent quality re-dispatch loop to detect (and re-roll on) + * empty / refused / malformed / low-confidence subagent outputs without + * a single extra provider call. All checks are pure string / regex + * operations. 
/**
 * Response-quality scoring rules.
 *
 * Mirrors the Python harness so a behaviour change in either side stays
 * one diff away from a matching change in the other. Every check below
 * is a pure string / regex operation — no provider calls.
 */

/** Verdict produced by {@link scoreResponse} for one response text. */
export interface ResponseQuality {
  /** True when no flag was raised. */
  ok: boolean;
  /** Rough 0..1 score, 1.0 = clean, lower = more flags / more severe. */
  score: number;
  /** Machine-readable flag names, in the order the checks ran. */
  flags: string[];
  /** Human-readable explanation for each flag (same order as `flags`). */
  reasons: string[];
  /**
   * False when the response should NOT be retried (an explicit refusal,
   * for instance — re-rolling will just produce the same refusal). The
   * caller is expected to escalate rather than retry in that case.
   */
  recoverable: boolean;
  /** CONFIDENCE: line value, if present and parseable. */
  confidence?: number;
  /** NEEDS_REVIEW: true|false line value, if present. */
  needsReview?: boolean;
}

/** Tuning knobs for {@link scoreResponse}. */
export interface ScoreOpts {
  /** Text shorter than this counts as "empty". Default 40. */
  minChars?: number;
  /**
   * When provided, every key must appear in some POST block's PRODUCES:
   * list for the response to pass. Empty array disables the check.
   */
  expectedOutputs?: string[];
  /**
   * When true, missing the CONFIDENCE: line itself becomes a flag.
   * Default false so existing prompts aren't penalised retroactively.
   */
  requireConfidence?: boolean;
}

// Single-shot patterns (no `g` flag, so `.exec` / `.test` are stateless).
const CONFIDENCE_RE = /^\s*CONFIDENCE\s*:\s*([0-9]*\.?[0-9]+)\s*$/im;
const NEEDS_REVIEW_RE = /^\s*NEEDS_REVIEW\s*:\s*(true|yes|y|1|false|no|n|0)\s*$/im;
const REFUSAL_RES: RegExp[] = [
  /\bi\s+cannot\b/i,
  /\bi\s+can'?t\b\s+(?:help|assist|do|comply)/i,
  /\bi\s+(?:am|'m)\s+(?:sorry|unable)\b.*\b(?:cannot|can'?t|won'?t)\b/i,
  /\bas\s+an\s+ai\s+(?:language\s+)?model\b/i,
  /\bI\s+do\s+not\s+have\s+the\s+ability\b/i,
];

// Counting patterns. They carry `g`, so they are never executed directly:
// every use goes through a fresh copy to avoid shared `lastIndex` state.
const HEADER_ACTION_RE = /^\s*ACTION\s*:\s*([A-Za-z]+)/gim;
const END_ACTION_RE = /^\s*END_ACTION\s*$/gim;
const POST_HEAD_RE = /^\s*POST\s*:\s*([A-Za-z][A-Za-z0-9_]*)\s*$/gim;
const POST_END_RE = /^\s*END_POST\s*$/gim;
const PRODUCES_RE = /^\s*PRODUCES\s*:\s*(.+)$/gim;

/** Score penalty per flag; a flag missing from the table costs DEFAULT_DEDUCTION. */
const DEDUCTIONS: Readonly<Record<string, number>> = {
  empty: 0.6,
  refusal: 0.5,
  malformed_action: 0.4,
  malformed_post: 0.3,
  outputs_missing: 0.4,
  low_confidence: 0.3,
  needs_review_self: 0.2,
  confidence_missing: 0.1,
};
const DEFAULT_DEDUCTION = 0.2;

/** Return a stateless, guaranteed-global copy of `re` that is safe to iterate. */
function freshGlobal(re: RegExp): RegExp {
  const flags = re.flags.includes("g") ? re.flags : `${re.flags}g`;
  return new RegExp(re.source, flags);
}

/**
 * Extract the CONFIDENCE: value as a 0..1 number.
 * Values above 1 are read as percentages (85 → 0.85) and capped at 1.
 */
function parseConfidence(text: string): number | undefined {
  const captured = CONFIDENCE_RE.exec(text)?.[1];
  if (captured === undefined) return undefined;
  const v = Number.parseFloat(captured);
  if (Number.isNaN(v)) return undefined;
  if (v > 1) return Math.min(1, v / 100);
  return Math.max(0, v);
}

/** Extract the NEEDS_REVIEW: value; undefined when the line is absent. */
function parseNeedsReview(text: string): boolean | undefined {
  const captured = NEEDS_REVIEW_RE.exec(text)?.[1];
  if (captured === undefined) return undefined;
  return ["true", "yes", "y", "1"].includes(captured.toLowerCase());
}

/** True when any refusal phrase appears anywhere in the text. */
function detectRefusal(text: string): boolean {
  return REFUSAL_RES.some((re) => re.test(text));
}

/**
 * Count non-overlapping matches of `re` in `text`.
 * Works for regexes with or without the `g` flag and never touches the
 * caller's `lastIndex` (a non-global regex used to spin forever here).
 */
function countMatches(re: RegExp, text: string): number {
  return Array.from(text.matchAll(freshGlobal(re))).length;
}

/** Headers minus terminators; 0 when the text has no header at all. */
function blockImbalance(headRe: RegExp, endRe: RegExp, text: string): number {
  const heads = countMatches(headRe, text);
  if (heads === 0) return 0;
  return heads - countMatches(endRe, text);
}

/** ACTION headers without a matching END_ACTION (positive = unterminated). */
function actionBlockImbalance(text: string): number {
  return blockImbalance(HEADER_ACTION_RE, END_ACTION_RE, text);
}

/** POST headers without a matching END_POST (positive = unterminated). */
function postBlockImbalance(text: string): number {
  return blockImbalance(POST_HEAD_RE, POST_END_RE, text);
}

/** Every key listed on any `PRODUCES: a, b, c` line, trimmed and de-duplicated. */
function declaredProduces(text: string): Set<string> {
  const out = new Set<string>();
  for (const m of text.matchAll(freshGlobal(PRODUCES_RE))) {
    for (const key of (m[1] ?? "").split(",")) {
      const k = key.trim();
      if (k) out.add(k);
    }
  }
  return out;
}

/** Expected keys that no PRODUCES: line declares (order of `expected` kept). */
function missingOutputs(text: string, expected: string[]): string[] {
  if (expected.length === 0) return [];
  const declared = declaredProduces(text);
  return expected.filter((k) => !declared.has(k));
}

/**
 * Score a single response text against the harness's quality rules.
 *
 * Always returns a `ResponseQuality` — never throws on a malformed
 * input, so callers can use the score even when the upstream provider
 * returned junk.
 *
 * @param text Raw response; `null` / `undefined` are treated as "".
 * @param opts Optional thresholds and contract keys (see {@link ScoreOpts}).
 * @returns The verdict; `ok` is true only when no flag was raised.
 */
export function scoreResponse(
  text: string | null | undefined,
  opts: ScoreOpts = {},
): ResponseQuality {
  const minChars = opts.minChars ?? 40;
  const expected = opts.expectedOutputs ?? [];
  const requireConfidence = opts.requireConfidence ?? false;

  const raw = text ?? "";
  const body = raw.trim();
  const flags: string[] = [];
  const reasons: string[] = [];
  let recoverable = true;
  const confidence = parseConfidence(raw);
  const needsReview = parseNeedsReview(raw);

  // A zero-length body is always "empty", whatever minChars says, so the
  // message reports the threshold that was actually enforced.
  const emptyThreshold = Math.max(1, minChars);
  if (body.length < emptyThreshold) {
    flags.push("empty");
    reasons.push(
      `Response body was ${body.length} chars (minimum ${emptyThreshold}). Re-emit a substantive response.`,
    );
  }
  if (detectRefusal(raw)) {
    flags.push("refusal");
    reasons.push(
      "Response looked like a refusal. The task is in-scope for this harness; respond directly or, " +
        "if blocked, explain the obstacle so the chief can re-cast or escalate.",
    );
    // Re-rolling a refusal just yields another refusal — escalate instead.
    recoverable = false;
  }
  const actBal = actionBlockImbalance(raw);
  if (actBal > 0) {
    flags.push("malformed_action");
    reasons.push(
      `${actBal} ACTION header(s) had no matching END_ACTION. Every ACTION block must terminate with a line END_ACTION.`,
    );
  }
  const postBal = postBlockImbalance(raw);
  if (postBal > 0) {
    flags.push("malformed_post");
    reasons.push(
      `${postBal} POST header(s) had no matching END_POST. Every POST block must terminate with a line END_POST.`,
    );
  }
  const missing = missingOutputs(raw, expected);
  if (missing.length > 0) {
    flags.push("outputs_missing");
    reasons.push(
      "Declared output contract keys not satisfied: " +
        missing.join(", ") +
        ". Each key must appear in some POST block's PRODUCES: list.",
    );
  }
  if (confidence !== undefined && confidence < 0.5) {
    flags.push("low_confidence");
    reasons.push(
      `You reported CONFIDENCE: ${confidence.toFixed(2)}. Either improve the response or escalate.`,
    );
  }
  if (needsReview === true) {
    flags.push("needs_review_self");
    reasons.push(
      "You set NEEDS_REVIEW: true. Sharpen the answer or call out the specific uncertainty.",
    );
  }
  if (requireConfidence && confidence === undefined) {
    flags.push("confidence_missing");
    reasons.push(
      "Response did not include a CONFIDENCE: <0..1> line. Emit one final line stating your confidence so the harness can decide whether to re-sample.",
    );
  }

  // Start from a perfect score and subtract per flag; round to 3 decimals
  // to hide float noise, and never go below zero.
  const total = flags.reduce((acc, f) => acc - (DEDUCTIONS[f] ?? DEFAULT_DEDUCTION), 1.0);
  const score = Math.max(0, Math.round(total * 1000) / 1000);

  return {
    ok: flags.length === 0,
    score,
    flags,
    reasons,
    recoverable,
    confidence,
    needsReview,
  };
}

/**
 * Build a re-dispatch preamble that names every flag so the worker
 * fixes the specific issues instead of re-rolling at random.
 * Returns "" for a clean verdict.
 */
export function repairHint(q: ResponseQuality): string {
  if (q.ok || q.reasons.length === 0) return "";
  const bullets = q.reasons.map((r) => `- ${r}`).join("\n");
  return (
    "Your previous response was rejected by the harness for the following reasons:\n" +
    bullets +
    "\nRe-emit a complete response that fixes every item above."
  );
}

/** Convenience: is the verdict worth re-rolling on? */
export function isRecoverable(q: ResponseQuality): boolean {
  return !q.ok && q.recoverable;
}

/** One-line summary for status bars and progress logs. */
export function summarise(q: ResponseQuality): string {
  if (q.ok) return `ok score=${q.score.toFixed(2)}`;
  return `flags=${q.flags.join(",") || "?"} score=${q.score.toFixed(2)}`;
}
b/pi-extension/src/tools.ts @@ -17,6 +17,9 @@ import { } from "./git.js"; import { activeSignal, mergeSignals, endRun } from "./abort.js"; import { classifyError, looksRedacted, recoveryHint, withRetry } from "./errors.js"; +import { dispatchWithQuality, runConsensus } from "./consensus.js"; +import { tmuxAvailable } from "./subagent.js"; +import { summarise } from "./quality.js"; /** * Push the latest commit to `origin` when the user opted in via @@ -401,6 +404,431 @@ export function registerClkTools(pi: ExtensionAPI): void { }, }); + // --------------------------------------------------------------------- + // clk_consensus — stochastic auto-consensus fan-out + // --------------------------------------------------------------------- + pi.registerTool({ + name: "clk_consensus", + label: "CLK Consensus", + description: + "Fan-out N parallel subagent samples for the SAME task; score each via the harness's " + + "quality detector and return the highest-scoring one (plus all candidates for traceability). " + + "Use this instead of clk_subagent whenever an answer is high-stakes (a design choice, a " + + "validation verdict, a non-trivial code edit), or whenever the chief is uncertain. Default " + + "samples=3; clamp 1..6.", + promptSnippet: + "Fan-out N stochastic samples for one task; quality-scored winner returned. " + + "Use liberally for high-stakes or uncertain dispatches.", + parameters: Type.Object({ + agent: Type.String({ + description: "Short role label (e.g. 'engineer', 'designer'). Embed the full persona in the task.", + }), + task: Type.String({ + description: "Complete task description, including role persona and context.", + }), + samples: Type.Optional( + Type.Integer({ minimum: 1, maximum: 6, description: "How many samples to draw. Default 3." }), + ), + preferredModel: Type.Optional( + Type.String({ + description: + "Short alias (claude-opus, claude-sonnet, claude-haiku, gpt-4o, gpt-4o-mini) " + + "or a provider/model string. 
Omit to use pi's default.", + }), + ), + minChars: Type.Optional( + Type.Integer({ minimum: 0, description: "Override minimum-response-length flag threshold (default 40)." }), + ), + }), + async execute(_id, params, signal, onUpdate, ctx) { + if (signal?.aborted || activeSignal()?.aborted) { + return { content: [{ type: "text", text: "clk_consensus cancelled before start." }], details: {} }; + } + if (!(await tmuxAvailable())) { + return { + content: [{ + type: "text", + text: "clk_consensus unavailable: tmux is not installed. Install it with: brew install tmux / apt install tmux", + }], + details: {}, + }; + } + if (looksRedacted(params.task)) { + return { + content: [{ type: "text", text: `clk_consensus skipped: 'task' appears redacted. ${recoveryHint("redaction")}` }], + details: {}, + }; + } + const sig = mergeSignals(signal, activeSignal()); + const samples = Math.max(1, Math.min(6, params.samples ?? 3)); + try { + const result = await runConsensus({ + agent: params.agent, + task: params.task, + preferredModel: params.preferredModel, + cwd: ctx.cwd, + signal: sig, + samples, + scoreOpts: params.minChars !== undefined ? { minChars: params.minChars } : {}, + onSample: (idx, message) => + onUpdate?.({ + content: [{ type: "text", text: `[consensus #${idx}] ${message}` }], + details: {}, + }), + }); + await appendProgress( + ctx.cwd, + { + kind: "consensus", + message: `${samples} samples for '${params.agent}': ${result.reason}`, + }, + pi, + ); + ctx.ui.setStatus("clk-last", `consensus: ${result.reason.slice(0, 80)}`); + const recap = result.all + .map((s) => + s.error + ? 
` #${s.index} error: ${s.error}` + : ` #${s.index} score=${s.quality.score.toFixed(2)} ` + + `(${summarise(s.quality)}) sessionId=${s.sessionId}`, + ) + .join("\n"); + const body = + `Consensus winner (sample #${result.best.index}, score ${result.best.quality.score.toFixed(2)}):\n\n` + + (result.best.output || "(winner produced no output)") + + `\n\n---\nAll samples:\n${recap}`; + return { + content: [{ type: "text", text: body }], + details: { + samples, + winnerIndex: result.best.index, + winnerScore: result.best.quality.score, + allScores: result.all.map((s) => ({ index: s.index, score: s.quality.score, flags: s.quality.flags })), + }, + }; + } catch (err) { + const cls = classifyError(err); + return { + content: [{ type: "text", text: `clk_consensus failed: ${(err as Error).message}. ${recoveryHint(cls)}` }], + details: { error: String(err) }, + }; + } + }, + }); + + // --------------------------------------------------------------------- + // clk_subagent_quality — single subagent + quality re-dispatch loop + // --------------------------------------------------------------------- + pi.registerTool({ + name: "clk_subagent_quality", + label: "CLK Subagent (quality-validated)", + description: + "Dispatch ONE subagent and gate its output through the quality detector. On a recoverable " + + "failure (empty / malformed / low-confidence), re-runs with a repair preamble up to " + + "`maxRetries` extra times. Cheaper than clk_consensus when the task is simple but you still " + + "want a quality gate. Default maxRetries=1.", + promptSnippet: "Single subagent dispatch with automatic quality scoring + repair-preamble re-rolls.", + parameters: Type.Object({ + agent: Type.String({ description: "Short role label." }), + task: Type.String({ description: "Complete task description, including persona." }), + preferredModel: Type.Optional(Type.String()), + maxRetries: Type.Optional( + Type.Integer({ minimum: 0, maximum: 4, description: "Extra dispatches on quality failures. 
Default 1." }), + ), + minChars: Type.Optional(Type.Integer({ minimum: 0 })), + }), + async execute(_id, params, signal, onUpdate, ctx) { + if (signal?.aborted || activeSignal()?.aborted) { + return { content: [{ type: "text", text: "clk_subagent_quality cancelled before start." }], details: {} }; + } + if (!(await tmuxAvailable())) { + return { + content: [{ type: "text", text: "tmux not installed; cannot dispatch." }], + details: {}, + }; + } + if (looksRedacted(params.task)) { + return { + content: [{ type: "text", text: `clk_subagent_quality skipped: 'task' appears redacted. ${recoveryHint("redaction")}` }], + details: {}, + }; + } + const sig = mergeSignals(signal, activeSignal()); + try { + const result = await dispatchWithQuality({ + agent: params.agent, + task: params.task, + preferredModel: params.preferredModel, + cwd: ctx.cwd, + signal: sig, + maxRetries: params.maxRetries ?? 1, + scoreOpts: params.minChars !== undefined ? { minChars: params.minChars } : {}, + onRetry: (attempt, q) => + onUpdate?.({ + content: [{ + type: "text", + text: `quality retry ${attempt}: ${summarise(q)} — re-rolling with repair preamble`, + }], + details: {}, + }), + }); + ctx.ui.setStatus("clk-last", `quality: ${summarise(result.quality)}`); + const body = + (result.output || "(subagent produced no output)") + + `\n\n---\nquality: ${summarise(result.quality)} after ${result.attempts} attempt(s).`; + return { + content: [{ type: "text", text: body }], + details: { + attempts: result.attempts, + score: result.quality.score, + ok: result.quality.ok, + flags: result.quality.flags, + sessionId: result.sessionId, + }, + }; + } catch (err) { + const cls = classifyError(err); + return { + content: [{ type: "text", text: `clk_subagent_quality failed: ${(err as Error).message}. 
${recoveryHint(cls)}` }], + details: { error: String(err) }, + }; + } + }, + }); + + // --------------------------------------------------------------------- + // clk_autoresearch — survey → investigate → critique loop + // --------------------------------------------------------------------- + pi.registerTool({ + name: "clk_autoresearch", + label: "CLK Autoresearch", + description: + "Karpathy-style autoresearch loop: spawn a researcher subagent to investigate the question, " + + "then a critic subagent to review the finding. Repeat for `iterations` cycles. Each finding " + + "and critique is appended to the progress log. Use BEFORE non-trivial implementation work to " + + "ground the chief in real findings rather than priors.", + promptSnippet: + "Iteratively investigate an open question via researcher + critic subagents.", + parameters: Type.Object({ + question: Type.String({ description: "The open question or hypothesis to investigate." }), + iterations: Type.Optional( + Type.Integer({ minimum: 1, maximum: 5, description: "Number of investigate-then-critique cycles. Default 2." }), + ), + preferredModel: Type.Optional(Type.String()), + }), + async execute(_id, params, signal, onUpdate, ctx) { + if (signal?.aborted || activeSignal()?.aborted) { + return { content: [{ type: "text", text: "clk_autoresearch cancelled before start." }], details: {} }; + } + if (!(await tmuxAvailable())) { + return { + content: [{ type: "text", text: "tmux not installed; cannot dispatch." }], + details: {}, + }; + } + if (looksRedacted(params.question)) { + return { + content: [{ type: "text", text: `clk_autoresearch skipped: 'question' appears redacted. ${recoveryHint("redaction")}` }], + details: {}, + }; + } + const sig = mergeSignals(signal, activeSignal()); + const iterations = Math.max(1, Math.min(5, params.iterations ?? 
2)); + const log: Array<{ iteration: number; finding: string; critique: string; findingScore: number; critiqueScore: number }> = []; + + for (let i = 1; i <= iterations; i++) { + if (sig?.aborted) break; + onUpdate?.({ + content: [{ type: "text", text: `autoresearch #${i}/${iterations}: investigating` }], + details: {}, + }); + const researcherTask = + `You are a researcher dispatched for autoresearch iteration #${i}. ` + + `Investigate this question deeply and report findings:\n\n${params.question}\n\n` + + (log.length > 0 + ? `Prior findings so far:\n${log.map((l) => `[iter ${l.iteration}] ${l.finding.slice(0, 300)}`).join("\n\n")}\n\n` + : "") + + "Produce concrete findings (cite files, measurements, logs). " + + "End your response with a single line: CONFIDENCE: <0..1>"; + const research = await dispatchWithQuality({ + agent: "researcher", + task: researcherTask, + preferredModel: params.preferredModel, + cwd: ctx.cwd, + signal: sig, + maxRetries: 1, + }); + if (sig?.aborted) break; + onUpdate?.({ + content: [{ type: "text", text: `autoresearch #${i}/${iterations}: critiquing` }], + details: {}, + }); + const criticTask = + `You are a critic for autoresearch iteration #${i}. The researcher reported:\n\n` + + (research.output || "(empty)") + + `\n\nOriginal question:\n${params.question}\n\n` + + "Identify gaps, weak evidence, contradicting facts. Be specific. 
" + + "End with: CONFIDENCE: <0..1>"; + const critic = await dispatchWithQuality({ + agent: "critic", + task: criticTask, + preferredModel: params.preferredModel, + cwd: ctx.cwd, + signal: sig, + maxRetries: 1, + }); + log.push({ + iteration: i, + finding: research.output, + critique: critic.output, + findingScore: research.quality.score, + critiqueScore: critic.quality.score, + }); + await appendProgress( + ctx.cwd, + { + kind: "autoresearch", + message: + `iter ${i}: research score=${research.quality.score.toFixed(2)} ` + + `critic score=${critic.quality.score.toFixed(2)}`, + }, + pi, + ); + } + const body = + `Autoresearch on: ${params.question}\n\n` + + log.map((l) => + `=== iteration ${l.iteration} ===\n` + + `FINDING (score ${l.findingScore.toFixed(2)}):\n${l.finding}\n\n` + + `CRITIQUE (score ${l.critiqueScore.toFixed(2)}):\n${l.critique}`, + ).join("\n\n"); + ctx.ui.setStatus("clk-last", `autoresearch: ${iterations} iter(s) on ${params.question.slice(0, 40)}`); + return { + content: [{ type: "text", text: body || "(autoresearch produced no iterations — aborted?)" }], + details: { + question: params.question, + iterations: log.length, + findings: log.map((l) => ({ iteration: l.iteration, findingScore: l.findingScore, critiqueScore: l.critiqueScore })), + }, + }; + }, + }); + + // --------------------------------------------------------------------- + // clk_ralph — branch / dispatch / evaluate / commit-or-revert iteration + // --------------------------------------------------------------------- + pi.registerTool({ + name: "clk_ralph", + label: "CLK Ralph Iteration", + description: + "One Ralph iteration: create a feature branch, dispatch a consensus fan-out of N samples, " + + "let the chief inspect the winning output (returned to it), then EITHER keep the branch " + + "(clk_merge) OR abandon it (clk_revert) based on the chief's verdict. The branch creation " + + "and dispatch are enforced in code so the chief can't skip the Ralph protocol. 
The chief " + + "still drives the accept/reject decision via subsequent clk_merge or clk_revert calls.", + promptSnippet: + "Branch + consensus dispatch one iteration; chief reviews winner and accepts via clk_merge or rejects via clk_revert.", + parameters: Type.Object({ + iterationName: Type.String({ + description: + "Short kebab-case branch suffix, e.g. 'iter-3-optimize-db'. Will be prefixed with 'ralph/'.", + }), + agent: Type.String({ description: "Role label for the dispatched worker." }), + task: Type.String({ description: "Full task description for the worker, including persona." }), + samples: Type.Optional( + Type.Integer({ minimum: 1, maximum: 6, description: "Consensus samples per iteration. Default 3." }), + ), + preferredModel: Type.Optional(Type.String()), + }), + async execute(_id, params, signal, onUpdate, ctx) { + if (signal?.aborted || activeSignal()?.aborted) { + return { content: [{ type: "text", text: "clk_ralph cancelled before start." }], details: {} }; + } + if (!(await tmuxAvailable())) { + return { + content: [{ type: "text", text: "tmux not installed; cannot dispatch." }], + details: {}, + }; + } + if (looksRedacted(params.task) || looksRedacted(params.iterationName)) { + return { + content: [{ type: "text", text: `clk_ralph skipped: parameters appear redacted. ${recoveryHint("redaction")}` }], + details: {}, + }; + } + const sig = mergeSignals(signal, activeSignal()); + const branchName = params.iterationName.startsWith("ralph/") + ? params.iterationName + : `ralph/${params.iterationName}`; + let home = getHomeBranch(); + try { + if (!home) { + home = await currentBranch(ctx.cwd, sig); + await setHomeBranch(ctx.cwd, home, pi); + } + await withRetry(() => createAndCheckoutBranch(ctx.cwd, branchName, sig), { signal: sig }); + } catch (err) { + const cls = classifyError(err); + return { + content: [{ type: "text", text: `clk_ralph failed to create branch '${branchName}': ${(err as Error).message}. 
${recoveryHint(cls)}` }], + details: { error: String(err) }, + }; + } + onUpdate?.({ + content: [{ type: "text", text: `ralph: on branch ${branchName}, dispatching ${params.samples ?? 3} samples` }], + details: {}, + }); + + try { + const consensus = await runConsensus({ + agent: params.agent, + task: params.task, + preferredModel: params.preferredModel, + cwd: ctx.cwd, + signal: sig, + samples: params.samples ?? 3, + onSample: (idx, message) => + onUpdate?.({ + content: [{ type: "text", text: `[ralph/${branchName} #${idx}] ${message}` }], + details: {}, + }), + }); + await appendProgress( + ctx.cwd, + { + kind: "ralph", + message: `iteration ${branchName}: ${consensus.reason}`, + }, + pi, + ); + ctx.ui.setStatus("clk-branch", `ralph: ${branchName}`); + const body = + `Ralph iteration on branch ${branchName} — home=${home}.\n\n` + + `Winning sample (#${consensus.best.index}, score ${consensus.best.quality.score.toFixed(2)}):\n\n` + + (consensus.best.output || "(no output)") + + "\n\n---\nReview the winner above. If it advances the goal, accept it with " + + "`clk_merge({message: ''})`. If it doesn't, abandon the branch with " + + "`clk_revert({reason: ''})` (the branch will be preserved for review)."; + return { + content: [{ type: "text", text: body }], + details: { + branch: branchName, + home, + winnerIndex: consensus.best.index, + winnerScore: consensus.best.quality.score, + allScores: consensus.all.map((s) => ({ index: s.index, score: s.quality.score, flags: s.quality.flags })), + }, + }; + } catch (err) { + const cls = classifyError(err); + return { + content: [{ type: "text", text: `clk_ralph dispatch failed on ${branchName}: ${(err as Error).message}. 
${recoveryHint(cls)}` }], + details: { error: String(err), branch: branchName }, + }; + } + }, + }); + pi.registerTool({ name: "clk_done", label: "CLK Done", diff --git a/pi-extension/tests/consensus.test.ts b/pi-extension/tests/consensus.test.ts new file mode 100644 index 0000000..3f8608d --- /dev/null +++ b/pi-extension/tests/consensus.test.ts @@ -0,0 +1,213 @@ +/** + * Tests for src/consensus.ts. We inject a fake spawn function so the + * tests don't need tmux or pi installed — the goal is to verify the + * scoring / picking / retry behaviour, not the real subprocess plumbing + * (which is exercised separately by the runtime smoke suite). + */ +import { test, describe } from "node:test"; +import assert from "node:assert/strict"; + +import { + dispatchWithQuality, + runConsensus, + type SpawnFn, +} from "../src/consensus.ts"; + +// Sentinel substring detection — the quality-retry repair preamble is +// rendered by quality.repairHint and begins with this exact phrase. +const REPAIR_MARKER = "Your previous response was rejected"; + +// Comfortably above the empty threshold so the quality detector lets it +// through. Used wherever a test needs a "passing" response body. 
// A reply long enough to clear the scorer's "empty" threshold; stands in
// for any passing subagent response throughout this file.
const GOOD = "This is a comfortably substantive response that exceeds the empty threshold without question.";

describe("dispatchWithQuality", () => {
  // Happy path: one spawn, one attempt, verdict ok.
  test("returns the first ok response without retrying", async () => {
    let spawnCount = 0;
    const spawn: SpawnFn = async () => {
      spawnCount += 1;
      return { output: GOOD, sessionId: `s${spawnCount}` };
    };

    const outcome = await dispatchWithQuality({ agent: "worker", task: "do the thing", cwd: "/tmp", spawn });

    assert.equal(spawnCount, 1);
    assert.equal(outcome.attempts, 1);
    assert.equal(outcome.quality.ok, true);
    assert.equal(outcome.sessionId, "s1");
  });

  // An empty first reply is recoverable, so the second dispatch carries
  // the repair preamble in front of the original task.
  test("retries with a repair preamble after a recoverable failure", async () => {
    const tasksReceived: string[] = [];
    const retryAttempts: number[] = [];
    const spawn: SpawnFn = async (opts) => {
      tasksReceived.push(opts.task);
      return tasksReceived.length === 1
        ? { output: "", sessionId: "s1" }
        : { output: GOOD, sessionId: "s2" };
    };

    const outcome = await dispatchWithQuality({
      agent: "worker",
      task: "first attempt task",
      cwd: "/tmp",
      maxRetries: 1,
      onRetry: (attempt) => {
        retryAttempts.push(attempt);
      },
      spawn,
    });

    assert.equal(tasksReceived.length, 2);
    assert.equal(outcome.attempts, 2);
    assert.equal(outcome.quality.ok, true);
    assert.deepEqual(retryAttempts, [1]);

    const [originalTask, repairedTask] = tasksReceived;
    assert.equal(originalTask?.includes(REPAIR_MARKER), false);
    assert.equal(repairedTask?.includes(REPAIR_MARKER), true);
    assert.equal(repairedTask?.includes("first attempt task"), true);
  });

  // Every reply is empty: expect the initial dispatch plus maxRetries re-rolls.
  test("stops retrying when maxRetries is exhausted", async () => {
    let spawnCount = 0;
    const spawn: SpawnFn = async () => {
      spawnCount += 1;
      return { output: "", sessionId: `s${spawnCount}` };
    };

    const outcome = await dispatchWithQuality({ agent: "worker", task: "task", cwd: "/tmp", maxRetries: 2, spawn });

    assert.equal(spawnCount, 3);
    assert.equal(outcome.attempts, 3);
    assert.equal(outcome.quality.ok, false);
  });

  // A refusal is flagged non-recoverable, so no retry budget is spent on it.
  test("does NOT retry on a non-recoverable failure (refusal)", async () => {
    let spawnCount = 0;
    const spawn: SpawnFn = async () => {
      spawnCount += 1;
      return { output: "I cannot help with that. As an AI language model, ...", sessionId: "s1" };
    };

    const outcome = await dispatchWithQuality({ agent: "worker", task: "task", cwd: "/tmp", maxRetries: 5, spawn });

    assert.equal(spawnCount, 1);
    assert.equal(outcome.quality.recoverable, false);
  });
});
+ assert.match(res.reason, /sample #\d won/); + }); + + test("clamps samples to 1..6", async () => { + let calls = 0; + const spawn: SpawnFn = async () => { + calls += 1; + return { output: GOOD, sessionId: `s${calls}` }; + }; + // samples = 10 should clamp down to 6. + const res = await runConsensus({ + agent: "designer", + task: "x", + cwd: "/tmp", + samples: 10, + spawn, + }); + assert.equal(res.all.length, 6); + }); + + test("captures spawn errors as sample.error without throwing", async () => { + let calls = 0; + const spawn: SpawnFn = async () => { + calls += 1; + if (calls === 2) throw new Error("tmux gone"); + return { output: GOOD, sessionId: `s${calls}` }; + }; + const res = await runConsensus({ + agent: "designer", + task: "x", + cwd: "/tmp", + samples: 3, + spawn, + }); + assert.equal(res.all.length, 3); + const failed = res.all.find((s) => s.error); + assert.ok(failed, "expected one sample to carry an error"); + assert.match(failed!.error!, /tmux gone/); + // Winner is still one of the successful samples, never the errored one. + assert.notEqual(res.best.index, failed!.index); + assert.equal(res.best.quality.ok, true); + }); + + test("returns the least-bad sample even when all fail", async () => { + const outputs = ["", "I cannot help.", ""]; // all bad + let issued = 0; + const spawn: SpawnFn = async () => { + const idx = issued++; + return { output: outputs[idx]!, sessionId: `s${idx + 1}` }; + }; + const res = await runConsensus({ + agent: "designer", + task: "x", + cwd: "/tmp", + samples: 3, + spawn, + }); + assert.equal(res.all.length, 3); + assert.equal(res.best.quality.ok, false); + // The refusal has score 0.5, the empties have 0.4 — so the refusal wins on score + // but the test we really care about is that runConsensus picked SOMETHING and + // never threw. 
+ assert.ok(typeof res.best.output === "string"); + }); + + test("respects maxParallel by capping concurrent spawns", async () => { + let inFlight = 0; + let peak = 0; + const spawn: SpawnFn = async () => { + inFlight += 1; + peak = Math.max(peak, inFlight); + await new Promise((r) => setTimeout(r, 10)); + inFlight -= 1; + return { output: GOOD, sessionId: "s" }; + }; + await runConsensus({ + agent: "x", + task: "t", + cwd: "/tmp", + samples: 6, + maxParallel: 2, + spawn, + }); + assert.ok(peak <= 2, `expected peak in-flight ≤ 2, got ${peak}`); + }); +}); diff --git a/pi-extension/tests/index.test.ts b/pi-extension/tests/index.test.ts index c17a6cf..14cb42a 100644 --- a/pi-extension/tests/index.test.ts +++ b/pi-extension/tests/index.test.ts @@ -73,10 +73,28 @@ describe("clkExtension default export", () => { await clkExtension(pi as any); const toolNames = tools.map((t) => t.name); - for (const required of ["clk_cast", "clk_progress", "clk_checkpoint", "clk_done"]) { + // Core orchestration + git plumbing tools. + const required = [ + "clk_cast", + "clk_progress", + "clk_checkpoint", + "clk_revert", + "clk_branch", + "clk_merge", + "clk_done", + // New code-enforced orchestration loops (ported from the Python + // harness's response_quality / consensus / autoresearch / ralph + // modules — see src/quality.ts, src/consensus.ts). 
+ "clk_consensus", + "clk_subagent_quality", + "clk_autoresearch", + "clk_ralph", + "clk_subagent", + ]; + for (const name of required) { assert.ok( - toolNames.includes(required), - `tool ${required} not registered (got ${toolNames.join(", ")})`, + toolNames.includes(name), + `tool ${name} not registered (got ${toolNames.join(", ")})`, ); } assert.ok(commands["clk"], "/clk command was not registered"); diff --git a/pi-extension/tests/prompts.test.ts b/pi-extension/tests/prompts.test.ts index 657c75e..4a2345f 100644 --- a/pi-extension/tests/prompts.test.ts +++ b/pi-extension/tests/prompts.test.ts @@ -15,7 +15,16 @@ describe("clkChiefPrimer", () => { test("mentions the core CLK tools", () => { const out = clkChiefPrimer("anything"); - for (const tool of ["clk_cast", "clk_subagent", "clk_checkpoint", "clk_done"]) { + for (const tool of [ + "clk_cast", + "clk_subagent", + "clk_subagent_quality", + "clk_consensus", + "clk_autoresearch", + "clk_ralph", + "clk_checkpoint", + "clk_done", + ]) { assert.ok(out.includes(tool), `primer should reference ${tool}`); } }); diff --git a/pi-extension/tests/quality.test.ts b/pi-extension/tests/quality.test.ts new file mode 100644 index 0000000..c250ba4 --- /dev/null +++ b/pi-extension/tests/quality.test.ts @@ -0,0 +1,126 @@ +/** + * Unit tests for src/quality.ts — pure regex/string scoring, no I/O. + * Mirrors tests/test_response_quality.py in the Python harness so a + * behaviour drift between the two implementations shows up here. + */ +import { test, describe } from "node:test"; +import assert from "node:assert/strict"; + +import { + scoreResponse, + repairHint, + isRecoverable, + summarise, +} from "../src/quality.ts"; + +describe("scoreResponse — happy paths", () => { + test("substantive prose passes with score 1.0", () => { + const text = "This is a substantive response covering the requested work in detail. 
" + + "It explains the approach, lists the files touched, and states next steps so the " + + "chief can keep moving without a re-roll."; + const q = scoreResponse(text); + assert.equal(q.ok, true); + assert.equal(q.score, 1.0); + assert.deepEqual(q.flags, []); + }); + + test("substantive prose with a CONFIDENCE line is still ok", () => { + const text = "Substantive enough response that exceeds the forty-character " + + "minimum easily.\nCONFIDENCE: 0.82"; + const q = scoreResponse(text); + assert.equal(q.ok, true); + assert.equal(q.confidence, 0.82); + }); +}); + +describe("scoreResponse — failure modes", () => { + test("empty body flags as 'empty' and is recoverable", () => { + const q = scoreResponse(""); + assert.equal(q.ok, false); + assert.ok(q.flags.includes("empty")); + assert.equal(q.recoverable, true); + }); + + test("near-empty body flags as 'empty' too", () => { + const q = scoreResponse("hi."); + assert.equal(q.ok, false); + assert.ok(q.flags.includes("empty")); + }); + + test("refusal phrase is flagged and NOT recoverable", () => { + const q = scoreResponse("I cannot help with that request. 
As an AI language model, ..."); + assert.equal(q.ok, false); + assert.ok(q.flags.includes("refusal")); + assert.equal(q.recoverable, false); + }); + + test("missing END_ACTION imbalance is flagged", () => { + const text = "Plenty of text here so we beat the empty threshold easily " + + "and definitely.\nACTION: write_file\nfoo\n"; // no END_ACTION + const q = scoreResponse(text); + assert.ok(q.flags.includes("malformed_action")); + }); + + test("missing END_POST imbalance is flagged", () => { + const text = "More than forty characters of preamble so the empty check passes.\n" + + "POST: my_topic\nbody\n"; + const q = scoreResponse(text); + assert.ok(q.flags.includes("malformed_post")); + }); + + test("low CONFIDENCE value gets the low_confidence flag", () => { + const text = "Long enough body to clear the empty threshold without question.\nCONFIDENCE: 0.10"; + const q = scoreResponse(text); + assert.ok(q.flags.includes("low_confidence")); + assert.equal(q.confidence, 0.1); + }); + + test("NEEDS_REVIEW: true flips needs_review_self", () => { + const text = "Body comfortably over the forty character minimum so empty does not fire.\n" + + "NEEDS_REVIEW: true"; + const q = scoreResponse(text); + assert.equal(q.needsReview, true); + assert.ok(q.flags.includes("needs_review_self")); + }); + + test("missing expected output keys gets outputs_missing flag", () => { + const text = "A response body comfortably exceeding the minimum length threshold.\n" + + "POST: t1\nPRODUCES: foo, bar\nbody\nEND_POST"; + const q = scoreResponse(text, { expectedOutputs: ["foo", "missing_one"] }); + assert.ok(q.flags.includes("outputs_missing")); + assert.match(q.reasons.join(" "), /missing_one/); + }); + + test("requireConfidence flag fires when CONFIDENCE absent", () => { + const text = "Long enough body to comfortably clear the minimum length threshold."; + const q = scoreResponse(text, { requireConfidence: true }); + assert.ok(q.flags.includes("confidence_missing")); + }); +}); + 
+describe("repairHint / isRecoverable / summarise", () => { + test("repairHint returns an empty string for an ok response", () => { + const q = scoreResponse("Long-enough response that passes the empty threshold."); + assert.equal(repairHint(q), ""); + }); + + test("repairHint quotes every reason as a bullet for failed responses", () => { + const q = scoreResponse("hi"); + const hint = repairHint(q); + assert.match(hint, /rejected by the harness/i); + assert.match(hint, /minimum 40/); + }); + + test("isRecoverable is true for recoverable failures, false for refusals", () => { + assert.equal(isRecoverable(scoreResponse("")), true); + assert.equal(isRecoverable(scoreResponse("I cannot help with this.")), false); + assert.equal(isRecoverable(scoreResponse("ok and substantive response over the minimum.")), false); + }); + + test("summarise gives a compact one-line description", () => { + const ok = scoreResponse("Long substantive response well over the minimum threshold."); + assert.match(summarise(ok), /^ok score=/); + const bad = scoreResponse(""); + assert.match(summarise(bad), /^flags=empty score=/); + }); +}); From 848faff01da38d46ca4ca98185b006e9f6d4e739 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 24 May 2026 16:33:17 +0000 Subject: [PATCH 3/3] =?UTF-8?q?docs:=20full=20README=20sweep=20=E2=80=94?= =?UTF-8?q?=20pi-extension=20orchestration=20loops,=20auto-push,=20doctor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Updates both READMEs to reflect the orchestration work that just landed in pi-extension and the recent main-line PRs (push-on-commit, doctor / diag CLI, multi-line truncation fix) that already shipped to master but weren't fully cross-referenced. 
pi-extension/README.md (full rewrite, +293 net lines) * Replaces the "8 small tools" narrative with a proper Tool Reference that groups roster / dispatch / iterative-refinement and explains when to pick clk_subagent vs clk_subagent_quality vs clk_consensus vs clk_autoresearch vs clk_ralph. * New "Response-quality scoring" section listing every flag the detector raises and how the repair-preamble loop quotes them back to the worker. Cross-references the Python harness's response_quality.py so behaviour drift between the two implementations is one diff away from being noticed. * New "Auto-push (opt-in)" section covering CLK_GITHUB_PUSH_ON_COMMIT, the ↑N ahead counter, and the pre-push secret-scanner interaction. * Commands table extended with /clk-help, /clk-doctor, /clk-undo (these existed in the code but the README only listed /clk and /clk-abort). * "What you keep / what changes" tables rewritten: stochastic consensus, quality re-dispatch, and Ralph refinement are now described as code-enforced (not chief-compliance dependent), and the comparison row about robustness loops names the new tools as the per-call equivalents of the Python harness's clk.config.json::robustness.* knobs. * Repository layout updated with src/quality.ts, src/consensus.ts, the new test files, and explicit per-file purposes. * "Testing" section reflects the real 96-test count and notes the suite runs entirely offline (consensus tests inject a fake spawn). README.md (main) — targeted updates * Pi extension section: brief but accurate rundown of the new orchestration tools, a Commands table that matches /clk-help, the CLK_GITHUB_PUSH_ON_COMMIT env var, and an updated example transcript that uses clk_consensus / clk_autoresearch / clk_ralph by name rather than the "fans out to 3 subagents" abstraction. * Layout section: pi-extension/ subtree expanded to show every src/ file with a one-line purpose, including the new quality.ts and consensus.ts. 
* Testing section: pi-extension test count corrected from 53 to 96 (~1s → ~2s), and the per-suite description rewritten to name the new modules (quality / consensus / git auto-push helpers / firstLineShort) so a contributor browsing the README knows what is and isn't covered. --- README.md | 92 ++++++-- pi-extension/README.md | 474 +++++++++++++++++++++++++++-------------- 2 files changed, 385 insertions(+), 181 deletions(-) diff --git a/README.md b/README.md index 9e56a0d..a340e1e 100644 --- a/README.md +++ b/README.md @@ -1011,9 +1011,15 @@ orchestration model — dynamic casting, stochastic consensus, Ralph refinement, and Karpathy-style autoresearch — into Pi behind a single `/clk` command. No Python harness required at runtime. -See [`pi-extension/README.md`](pi-extension/README.md) for full -documentation including tool reference, state layout, error handling, -and customization notes. Quick summary: +The TypeScript extension now ports the harness's response-quality +scoring and consensus fan-out as **real tools** (`clk_consensus`, +`clk_subagent_quality`, `clk_autoresearch`, `clk_ralph`) rather than +relying on chief compliance — every parallel sample is scored by the +same rules `clk_harness/orchestration/response_quality.py` uses, the +winner is picked in code, and Ralph branches are created by the tool so +the protocol can't be skipped. See [`pi-extension/README.md`](pi-extension/README.md) +for the full tool reference, state layout, error handling, and +customisation notes. **Requirements:** Pi on `PATH`; tmux on `PATH`; Git on `PATH`. @@ -1025,12 +1031,35 @@ and customization notes. 
Quick summary: | Project-local | `mkdir -p .pi/extensions && ln -s /path/to/CognitiveLoopKernel/pi-extension .pi/extensions/clk` | Version-controlled per project | | Global | `mkdir -p ~/.pi/agent/extensions && ln -s /path/to/CognitiveLoopKernel/pi-extension ~/.pi/agent/extensions/clk` | Available in every Pi session | -**Usage:** +**Commands:** | Command | Effect | |---------|--------| | `/clk <idea>` | Capture the idea and hand off to the chief. Resumes if state exists. | | `/clk-abort` | End the active run. State is preserved; resume with `/clk` later. | +| `/clk-help` | List every CLK slash command, every orchestration tool the chief uses, and the active safety nets. | +| `/clk-doctor` | Health-check tmux, git, the workspace `.clk/` layout, the pre-push hook, and (when a remote exists) the count of local commits not yet pushed. | +| `/clk-undo` | Preview the last CLK commit; `/clk-undo confirm` creates a revert commit on top of it. | + +**Orchestration tools the chief uses (you don't call these directly):** + +| Tool | Purpose | +|---|---| +| `clk_cast` | Persist a roster of project-specific specialist roles. | +| `clk_subagent` | Raw single-subagent dispatch via a detached tmux pi session. | +| `clk_subagent_quality` | One subagent + automatic repair-preamble re-rolls on quality failures. | +| `clk_consensus` | Fan out N parallel samples (default 3, max 6), score each, return the winner plus every candidate's score. | +| `clk_autoresearch` | Bounded researcher + critic alternation; each iteration recorded on the progress log. | +| `clk_ralph` | Create a `ralph/<iterationName>` branch and run a consensus fan-out in one call; chief then calls `clk_merge` or `clk_revert`. | +| `clk_branch` / `clk_merge` / `clk_revert` / `clk_checkpoint` | Git plumbing for the Ralph iteration cycle. | +| `clk_progress` | Append a one-line entry to `.clk/state/progress.md`. | +| `clk_done` | Mark the run complete and write `.clk/state/done.md`.
| + +**Optional env vars:** + +| Variable | Effect | +|---|---| +| `CLK_GITHUB_PUSH_ON_COMMIT=true` | After every `clk_checkpoint` and `clk_merge`, run `git push origin HEAD` best-effort and surface an `↑N` ahead counter if the push fails. Same env var as the Python TUI. | A typical session: @@ -1038,10 +1067,12 @@ A typical session: > /clk a local-first journaling app that summarizes my week [CLK run started. The chief is taking over.] [chief casts engineer, ux_writer, summarizer, qa] -[chief fans out to 3 parallel architecture subagents → judge synthesizes] -[chief dispatches worker to implement MVP] -[chief calls clk_checkpoint: "MVP: capture + persist entries"] -[chief opens feature branch with clk_branch, runs Ralph iteration ...] +[chief calls clk_consensus({agent:"architect", samples:3, task:"... storage design ..."})] +[harness fans out 3 parallel tmux pi subagents, scores each, returns the winner] +[chief calls clk_autoresearch({question:"sync model: append-only vs CRDT?"})] +[chief calls clk_ralph({iterationName:"iter-1-mvp", agent:"engineer", task:"... 
build MVP ..."})] +[chief calls bash: pytest -q] +[chief calls clk_merge: "ralph win: MVP capture+persist+summarize"] [chief calls clk_done: "MVP runs; tests pass; README + deploy plan present"] ``` @@ -1069,7 +1100,19 @@ scripts/ tests/ # pytest regression suite (CI-gated) user_tests/ # pytest end-to-end suite (drives CLI + REST API) pi-extension/ # standalone Pi extension (TypeScript) - tests/ # node --test suites (errors, prompts, state, git, index) + src/ + index.ts # /clk + /clk-help + /clk-doctor + /clk-undo, session lifecycle + prompts.ts # the chief's operator's manual + tools.ts # clk_cast / clk_progress / clk_checkpoint / clk_branch / + # clk_merge / clk_revert / clk_consensus / clk_subagent_quality / + # clk_autoresearch / clk_ralph / clk_done + subagent.ts # raw clk_subagent — spawnSubagent() exposed for consensus + consensus.ts # dispatchWithQuality + runConsensus (port of agent.py) + quality.ts # scoreResponse + repairHint (port of response_quality.py) + git.ts # checkpoint, branch, merge, revert + hasRemote / commitsAhead / + # pushBestEffort (port of git_ops.py auto-push helpers) + state.ts / abort.ts / errors.ts / types.ts + tests/ # node --test suites covering every file in src/ docs/ REST_API.md # full REST API reference ``` @@ -1801,7 +1844,7 @@ pytest user_tests/ -v # Pi extension TypeScript suite cd pi-extension npm install -npm test # unit + integration tests (53 tests, ~1s) +npm test # unit + integration tests (96 tests, ~2s) npm run test:strict # also runs `tsc --noEmit` ``` @@ -1824,14 +1867,31 @@ The `pi-extension/tests/` suite verifies: - `classifyError`, `withRetry`, `looksRedacted`, `isMaxTurnsResult`, and all `recoveryHint` branches. -- `clkChiefPrimer` renders the captured idea + all CLK tool names. +- `clkChiefPrimer` renders the captured idea + every CLK tool name + (`clk_cast`, `clk_subagent`, `clk_subagent_quality`, `clk_consensus`, + `clk_autoresearch`, `clk_ralph`, `clk_checkpoint`, `clk_done`). 
+- `scoreResponse` flags every documented failure mode (empty / refusal / + malformed ACTION / malformed POST / missing outputs / low confidence / + needs-review / missing-confidence) and `repairHint` quotes each reason + to the worker. +- `runConsensus` fans out N samples, scores them, picks the winner, caps + to `maxParallel`, and captures spawn errors without throwing. + `dispatchWithQuality` retries with a repair preamble on recoverable + failures and stops on refusal or `maxRetries`. - `setIdea`, `setRoster`, `appendProgress`, `markDone`, `isDone` round-trip state through `.clk/state/*.json` and `progress.md`. -- The `git` wrapper does init, checkpoint, branch, merge, and revert - correctly against a real `git` binary. -- The extension's `default` export registers the documented tools - (`clk_cast`, `clk_progress`, `clk_checkpoint`, `clk_done`) and the - `/clk` slash command, and handles an empty-idea invocation cleanly. +- The `git` wrapper does init, checkpoint, branch, merge, revert, + `hasRemote`, `commitsAhead`, and `pushBestEffort` correctly against a + real `git` binary (including the bare-upstream sync, the unreachable- + remote failure path, and the no-remote no-op). +- The extension's `default` export registers every documented tool + (`clk_cast`, `clk_progress`, `clk_checkpoint`, `clk_revert`, + `clk_branch`, `clk_merge`, `clk_done`, `clk_consensus`, + `clk_subagent_quality`, `clk_autoresearch`, `clk_ralph`, + `clk_subagent`) and the `/clk` slash command, and handles an + empty-idea invocation cleanly. +- `firstLineShort` returns single-line, capped output so a multi-line + idea never bleeds line 2 into the Pi status bar. 
## Customization diff --git a/pi-extension/README.md b/pi-extension/README.md index 920f9e1..62d4213 100644 --- a/pi-extension/README.md +++ b/pi-extension/README.md @@ -1,13 +1,15 @@ # CLK as a Pi extension -A lightweight [pi.dev](https://pi.dev) extension that brings the Cognitive Loop -Kernel orchestration model — dynamic agent casting, stochastic consensus, -Ralph refinement, and Karpathy-style autoresearch — into Pi behind a single -`/clk` command. +A [pi.dev](https://pi.dev) extension that brings the full Cognitive Loop +Kernel orchestration model — dynamic agent casting, stochastic +consensus, Karpathy-style autoresearch, and Ralph refinement — into Pi +behind a single `/clk` command. -> **Experimental.** Companion to the Python [CLK harness](../README.md) in the -> parent repo, but standalone: this extension does not depend on that harness -> at runtime. It targets Pi natively. Use at your own risk. +> **Experimental.** Companion to the Python [CLK harness](../README.md) +> in the parent repo, but standalone: this extension does not depend on +> that harness at runtime. It is a self-contained TypeScript port of +> the parts of `clk_harness/orchestration/` that make sense inside Pi. +> Use at your own risk. ## What it does @@ -19,70 +21,146 @@ You type: The extension: -1. Captures the idea, initialises a git repo if needed, and persists state - under `.clk/state/`. -2. Hands control to the chief LLM with a CLK operator's manual (see - [`src/prompts.ts`](src/prompts.ts)) that establishes standing rules: - cast a team, dispatch via the `clk_subagent` tool, apply parallel consensus - on high-stakes decisions, run Ralph refinement after MVP, autoresearch - on open questions, checkpoint after every win, revert on regression, - call `clk_done` when every completion criterion is met. -3. 
Provides the chief with eight small tools — `clk_cast`, `clk_progress`, - `clk_checkpoint`, `clk_branch`, `clk_revert`, `clk_merge`, `clk_done`, - and `clk_subagent` — that handle persistence, git mechanics, and subagent - dispatch. `clk_branch` opens a per-iteration feature branch before each - Ralph pass, `clk_merge` folds it into the home branch on success, and - `clk_revert` discards the branch without merging when the iteration is - rejected. `clk_subagent` spawns a detached tmux pi session and streams - its result back when it exits. Everything else (fan-out, judging, - refinement loops) is the chief driving Pi's built-in tools. - -The extension itself is intentionally thin: orchestration policy lives in the -chief's prompt, not in TypeScript. To change CLK's behavior, edit -[`src/prompts.ts`](src/prompts.ts). +1. Captures the idea, initialises a git repo if needed, and persists + state under `.clk/state/`. +2. Installs hardened safety nets in the project (`.gitignore`, + pre-push secret-scan hook). +3. Hands control to the chief LLM with an operator's manual (see + [`src/prompts.ts`](src/prompts.ts)) that establishes standing rules + for casting, dispatching, consensus, autoresearch, Ralph + refinement, checkpointing, completion criteria, and error recovery. +4. Provides the chief with a suite of orchestration tools (see + [Tool reference](#tool-reference)) that fan out parallel subagent + samples via tmux, score every output with the same response-quality + rules the Python harness uses, manage git branches for Ralph + iterations, and persist progress. 
+ +Unlike the original incarnation of this extension, **orchestration +policy is now enforced in code**, not only in the chief's prompt: +`clk_consensus` actually spawns N parallel tmux sessions and scores +them; `clk_subagent_quality` actually re-rolls failures with a repair +preamble; `clk_autoresearch` actually alternates a researcher and +critic; `clk_ralph` actually creates the branch and runs the fan-out. +The chief can't accidentally skip these steps by misreading the +prompt — the tools enforce the shape. ## Commands | Command | What it does | |---|---| | `/clk <idea>` | Start a CLK run. Casts a team, dispatches them, runs Ralph + autoresearch. | -| `/clk-abort` | End the current run. State is preserved for resume. | -| `/clk-help` | List every CLK command and the safety nets active in the workspace. | -| `/clk-doctor` | Health-check `tmux`, `git`, the `.clk/` layout, `.gitignore`, and the pre-push hook. Pure environment checks; no Pi calls. | -| `/clk-undo` | Preview the last CLK commit; `/clk-undo confirm` creates a revert commit on top of it. | +| `/clk-abort` | End the active run. State is preserved for resume. | +| `/clk-help` | List every CLK command, every orchestration tool the chief uses, and the safety nets active in the workspace. | +| `/clk-doctor` | Health-check tmux, git, the `.clk/` layout, `.gitignore`, the pre-push hook, and (when a remote exists) the count of local commits not yet pushed. Pure environment checks; no Pi calls. | +| `/clk-undo` | Preview the last CLK commit; `/clk-undo confirm` creates a revert commit on top of it. Refuses if there are uncommitted changes. | + +## Tool reference + +The chief invokes these as `clk_*` tools — you do not call them from +slash commands. They are listed here so you know what your run is doing +when you read the progress log or notifications. + +### Roster + status + +| Tool | Purpose | +|---|---| +| `clk_cast` | Persist a roster of project-specific specialist roles (name, mission, persona).
The chief authors the roster on the fly. | +| `clk_progress` | Append a one-line entry to `.clk/state/progress.md`. Used at every meaningful transition (dispatch / consensus / ralph / autoresearch / branch / merge / done / note). | + +### Dispatch (pick the right one) + +| Tool | Use when… | +|---|---| +| `clk_subagent({ agent, task, preferredModel? })` | Cheap, low-risk single-subagent dispatch with no quality gate. Reserve for genuinely throwaway work. | +| `clk_subagent_quality({ agent, task, maxRetries?, preferredModel?, minChars? })` | One subagent **scored by the quality detector**, with up to `maxRetries` automatic repair re-rolls. Default for any single-worker task where you'd rather catch bad output than propagate it. | +| `clk_consensus({ agent, task, samples?, preferredModel?, minChars? })` | Fan out N parallel samples (default 3, clamped 1..6), score each, return the highest-scoring winner plus every candidate. Use liberally for design choices, architecture, validation verdicts, security/perf reviews, ambiguous requirements. | +| `clk_autoresearch({ question, iterations?, preferredModel? })` | Bounded `researcher` + `critic` alternation (default 2 iterations, clamped 1..5). Each finding and critique is recorded on the progress log. Use before non-trivial implementation work whenever the optimal approach is unclear. | + +### Iterative refinement + +| Tool | Use when… | +|---|---| +| `clk_branch({ name })` | Manually open a feature branch for an iteration. Records the home branch automatically. | +| `clk_ralph({ iterationName, agent, task, samples?, preferredModel? })` | One-call Ralph iteration: creates `ralph/<iterationName>`, fans out a consensus dispatch, returns the winner. Chief then runs validation and calls `clk_merge` (accept) or `clk_revert` (reject). The branch creation + fan-out happen in one step and can't be skipped. | +| `clk_checkpoint({ message })` | Stage all working-tree changes and create a `[clk] <message>` commit. Returns the new HEAD SHA.
When `CLK_GITHUB_PUSH_ON_COMMIT=true` and an `origin` remote exists, also runs `git push origin HEAD` best-effort. | +| `clk_merge({ message })` | Commit any pending changes on the feature branch, merge it into the home branch with `--no-ff`, return to home. Same auto-push behavior as `clk_checkpoint`. | +| `clk_revert({ reason })` | Commit any pending work on the rejected branch (preserving it), switch back to home without merging. The rejected branch is **never** deleted. | +| `clk_done({ reason })` | Mark the run complete. Writes `.clk/state/done.md`, ends the run lifecycle. Only call when every completion criterion in [`src/prompts.ts`](src/prompts.ts) is satisfied. | + +### Response-quality scoring + +Every `clk_subagent_quality`, `clk_consensus`, and `clk_autoresearch` +output is scored by [`src/quality.ts`](src/quality.ts) (TypeScript port +of `clk_harness/orchestration/response_quality.py`). The scorer flags: + +- **empty** — body shorter than `minChars` (default 40) +- **refusal** — refusal phrases ("I cannot", "as an AI language model", ...) — marked non-recoverable so the harness escalates instead of re-rolling +- **malformed_action** — `ACTION:` headers without matching `END_ACTION` +- **malformed_post** — `POST:` headers without matching `END_POST` +- **outputs_missing** — declared output keys not present in any `POST` block's `PRODUCES:` list +- **low_confidence** — a parsed `CONFIDENCE: <0..1>` line below 0.5 +- **needs_review_self** — a `NEEDS_REVIEW: true` line +- **confidence_missing** — no `CONFIDENCE:` line at all (only when the caller passes `requireConfidence: true`) + +Each flag carries a short repair reason; on a recoverable failure the +caller re-dispatches with a preamble that quotes every reason back to +the worker so it fixes the specific issues rather than re-rolling at +random. Same protocol the Python harness uses in +`agent.py::_dispatch_with_quality_loop`. 
## Safety nets -The extension installs the same safety nets the Python harness uses, so -running CLK from Pi is just as recoverable: - -- **Hardened `.gitignore`.** On the first `/clk`, the extension writes a - `.gitignore` that blocks `.env`, `.env.bak`, `.env.partial`, `*.pem`, - `*.key`, `*_id_rsa*`, `/secrets/`, plus editor / OS junk. Existing - `.gitignore` content is never clobbered. -- **Pre-push secret scanner.** A `.git/hooks/pre-push` hook (pure bash, - no extra deps) scans the about-to-be-pushed objects for obvious API - key patterns (`ANTHROPIC_API_KEY=…`, `OPENAI_API_KEY=…`, `sk-…`, - Slack `xoxb-…`, private-key headers). On a hit it aborts the push. - Bypass once with `git push --no-verify`. +The extension installs the same safety nets the Python harness uses, +so running CLK from Pi is just as recoverable: + +- **Hardened `.gitignore`.** On the first `/clk`, the extension writes + a `.gitignore` that blocks `.env`, `.env.bak`, `.env.partial`, + `*.pem`, `*.key`, `*_id_rsa*`, `/secrets/`, plus editor / OS junk. + Existing `.gitignore` content is never clobbered. +- **Pre-push secret scanner.** A `.git/hooks/pre-push` hook (pure + bash, no extra deps) scans the about-to-be-pushed objects for + obvious API-key patterns (`ANTHROPIC_API_KEY=…`, `OPENAI_API_KEY=…`, + `sk-…`, Slack `xoxb-…`, private-key headers). On a hit it aborts the + push. Bypass once with `git push --no-verify`. - **Atomic state writes.** Every state file under `.clk/state/` (`clk.json`, `idea.json`, `roster.json`, `done.md`) is written via `tmp+rename` with a `.bak` rotation, so a crash mid-write leaves either the old or the new file intact — never half. -- **`restoreBackup` primitive.** Exposed from `src/state.ts` so a - future "undo last state change" can swap a `.bak` back deterministically. +- **`restoreBackup` primitive.** Exposed from `src/state.ts` for + programmatic recovery of `.bak` snapshots. 
+- **AbortController cancellation.** `/clk-abort` and session shutdown + fire a single abort signal that propagates to every in-flight tmux + subagent session, every git operation, and every backoff sleep. + +## Auto-push (opt-in) + +Set `CLK_GITHUB_PUSH_ON_COMMIT=true` in the Pi environment to have the +extension auto-push after every `clk_checkpoint` and `clk_merge`. The +push is best-effort — on failure (no network, no upstream, rejected by +the pre-push hook), the run continues and the `clk-git` status bar +flips to `↑N` showing the count of unpushed commits. + +When the env var is **unset** but the repo has an `origin` remote, the +`clk-git` status bar still surfaces an `↑N unpushed` hint so you know +what's accumulated locally. `/clk-doctor` includes the same count as +a `! warn` row. + +The pre-push secret scanner runs *before* the auto-push leaves your +machine, so an accidental commit containing an API key still gets +blocked. ## Requirements - Pi installed and on `PATH` (`pi --version` works). -- tmux installed and on `PATH` (`tmux -V` works). The extension spawns each - subagent as a detached tmux pi session — this is how it achieves true - process isolation without depending on any external Pi extension. - Install: `brew install tmux` (macOS) or `apt install tmux` (Debian/Ubuntu). - On `session_start` the extension checks for tmux and emits a one-time - warning if it's missing. -- Git on `PATH` (the extension auto-runs `git init` in the project root if - there's no repo yet). +- tmux installed and on `PATH` (`tmux -V` works). The extension spawns + each subagent as a detached tmux pi session — this is how it + achieves true process isolation without depending on any external + Pi extension. Install: `brew install tmux` (macOS) or + `apt install tmux` (Debian/Ubuntu). On `session_start` the + extension checks for tmux and emits a one-time warning if it's + missing. 
+- Git on `PATH` (the extension auto-runs `git init` in the project + root if there's no repo yet). - Node 20+ (Pi already requires this; only relevant if you want `AbortSignal.any` for the cleanest cancel behavior). @@ -92,8 +170,8 @@ Three options. Pick whichever matches your workflow. ### Option A: Quick test (`-e`, no install) -Best for trying it out or iterating on the extension itself. Pi loads the -file directly and reloads on `/reload`: +Best for trying it out or iterating on the extension itself. Pi loads +the file directly and reloads on `/reload`: ```bash pi -e /path/to/CognitiveLoopKernel/pi-extension/src/index.ts @@ -108,8 +186,8 @@ mkdir -p .pi/extensions ln -s /path/to/CognitiveLoopKernel/pi-extension .pi/extensions/clk ``` -Pi auto-discovers `.pi/extensions/*/index.ts` on startup. The chief's tools -appear in every Pi session opened in this project. +Pi auto-discovers `.pi/extensions/*/index.ts` on startup. The chief's +tools appear in every Pi session opened in this project. ### Option C: Global install (all projects) @@ -132,31 +210,32 @@ Or list it explicitly in `~/.pi/agent/settings.json`: ## Usage -| Command | Effect | -|-----------------|------------------------------------------------------------------------| -| `/clk ` | Capture the idea and hand off to the chief. Resumes if state exists. | -| `/clk-abort` | End the active run. Cancels the chief's current turn and signals all in-flight subagents to stop. State on disk is preserved; you can `/clk` again later. | - -Cancel mid-turn with **Esc** (Pi's built-in) — that cancels the current model -call but leaves the CLK run lifecycle intact, so the chief can be steered and -continue. Use `/clk-abort` when you want to end the whole run. +Cancel mid-turn with **Esc** (Pi's built-in) — that cancels the current +model call but leaves the CLK run lifecycle intact, so the chief can be +steered and continue. Use `/clk-abort` when you want to end the whole +run. 
A typical first transcript looks like: ```text > /clk a local-first journaling app that summarizes my week [notification] CLK run started. The chief is taking over. -[chief] (calls clk_cast with engineer, ux_writer, summarizer, qa) -[chief] (calls clk_subagent: scout to understand existing layout) -[chief] (calls clk_subagent x3 in parallel: 3 architectures for storage) -[chief] (calls clk_subagent: oracle to judge architectures) -[chief] (calls clk_progress: consensus → SQLite + JSON sidecar) -[chief] (calls clk_subagent: worker to implement MVP) -[chief] (calls bash: pytest -q) -[chief] (calls clk_checkpoint: "MVP: capture + persist entries") -[chief] (enters Ralph loop ...) -... -[chief] (calls clk_done: "MVP runs; tests pass; README + deploy plan + checklist + CLI all present") +[chief] clk_cast({ engineer, ux_writer, summarizer, qa }) +[chief] clk_consensus({ agent:"architect", samples:3, + task:"3 storage designs for offline-first journal" }) +[harness] consensus :: 3 samples, winner #2 score=0.92 (all: #1=0.74 #2=0.92 #3=0.81) +[chief] clk_progress({ kind:"consensus", message:"3 samples for architect: ..." }) +[chief] clk_autoresearch({ question:"sync model: append-only vs CRDT for journals?", iterations:2 }) +[harness] autoresearch #1/2: investigating → critiquing +[harness] autoresearch #2/2: investigating → critiquing +[chief] clk_ralph({ iterationName:"iter-1-mvp", agent:"engineer", samples:3, + task:"... implement MVP from winning architecture ..." }) +[harness] on branch ralph/iter-1-mvp, 3 samples, winner #1 score=0.88 +[chief] bash({ command:"pytest -q" }) +[chief] clk_merge({ message:"ralph win: MVP capture+persist+summarize" }) +[harness] merged ralph/iter-1-mvp → main; clk-git: synced +... (Ralph iterations continue) ... 
+[chief] clk_done({ reason:"MVP runs; tests pass; README + deploy plan present" }) ``` ## State on disk @@ -169,125 +248,152 @@ Everything CLK persists lives under `.clk/`: idea.json # captured idea + timestamp roster.json # current cast: name, mission, persona per role progress.md # human-readable timeline (one line per event) - clk.json # full state snapshot (idea + roster + progress) + clk.json # full state snapshot (idea + roster + progress + homeBranch) done.md # written only when clk_done is called + *.bak # rotated previous version of any of the above + subagents// # per-spawn scratch: task.md + stdout.txt; cleaned up on exit logs/ .log # one log file per clk_subagent call; records spawn, - # tmux start/exit, abort, timeout, and the first 500 + # tmux start/exit, abort, timeout, and the first 2000 # chars of output for post-mortem debugging ``` -The roster, progress log, and full snapshot are also written to Pi's session -JSONL via `pi.appendEntry` — so they're replayed automatically when you -resume a session, and they survive a `pi --resume`. +The roster, progress log, and full snapshot are also written to Pi's +session JSONL via `pi.appendEntry` — so they're replayed automatically +when you resume a session, and they survive a `pi --resume`. -Git commits made by `clk_checkpoint` carry a `[clk]` prefix and are real -commits in the project repo. The chief uses them as Ralph-style baselines and -reverts to them on regression. +Git commits made by `clk_checkpoint` and `clk_merge` carry a `[clk]` +prefix and are real commits in the project repo. The chief uses them +as Ralph-style baselines and reverts to them on regression. ## What you keep from the original CLK -- **Single command, idea-first.** `/clk ` is the only entry point. -- **Dynamic casting.** The chief invents project-specific roles on the fly - with personas and missions it authors itself, persisted to `roster.json`. 
-- **Stochastic consensus.** High-stakes decisions fan out to parallel - candidates (Pi runs sibling tool calls concurrently by default), then a - judge subagent picks or synthesizes. -- **Ralph refinement loop.** Pre-iteration checkpoint → dispatch → validate - → commit-or-revert. Failed iterations leave no trace in the working tree. -- **Autoresearch loop.** When stuck on open questions, the chief designs and - runs small experiments (researcher / scout / spike) and records learnings - regardless of outcome. -- **Self-healing.** Repeated failure triggers consensus on root cause and - optionally a fresh `clk_cast` to add a specialist who can fix the upstream - issue. +- **Single command, idea-first.** `/clk <idea>` is the only entry + point. +- **Dynamic casting.** The chief invents project-specific roles on the + fly with personas and missions it authors itself, persisted to + `roster.json`. +- **Stochastic consensus (code-enforced).** `clk_consensus` spawns N + parallel tmux subagent samples, scores each via the same regex / + contract rules the Python harness uses, and returns the + highest-scoring winner. The chief can fan out at will rather than relying on + the LLM to remember to emit parallel tool calls. +- **Quality re-dispatch (code-enforced).** `clk_subagent_quality` (and + the consensus pipeline internally) re-rolls on recoverable failures + with a repair preamble that quotes the specific flags back to the + worker. +- **Ralph refinement loop (code-enforced).** `clk_ralph` creates the + feature branch and runs the fan-out in one tool call; the chief + decides accept/reject afterwards via `clk_merge` / `clk_revert`. + Failed iterations leave no trace on the home branch — the rejected + branch is preserved for review and never deleted. +- **Karpathy-style autoresearch.** `clk_autoresearch` alternates a + `researcher` and `critic` subagent for N bounded iterations, + recording every finding and critique on the progress log.
+- **Memory through git.** Every successful milestone is committed + with a structured message so future agent runs can mine the log for + context. ## What changes from the original CLK -| Original CLK | Pi extension | +| Original CLK (Python harness) | Pi extension | |---|---| -| Provider-agnostic (claude/codex/gemini/ollama/openwebui/pi) | Tied to Pi | -| Curses TUI dashboard with live agent cards | Pi's single conversation stream + status-line entries | -| `ACTION:` block protocol for write/edit/append/delete/run | Pi's built-in `read`/`write`/`edit`/`bash` tools | -| YAML workflows in `.clk/config/workflows/` | None — the chief decides workflow on the fly | -| Per-agent prompt files in `.clk/prompts/` | One operator's manual in `src/prompts.ts`; per-role personas live in `roster.json` | -| Subprocess-piped agents | tmux pi sessions (via the extension's `clk_subagent` tool) | +| Provider-agnostic (claude / codex / gemini / ollama / openwebui / pi / shell) | Tied to Pi (which can route to its own upstream of choice). | +| Curses TUI dashboard with live agent cards + cost meter | Pi's single conversation stream + status-line entries (`clk-idea`, `clk-roster`, `clk-head`, `clk-branch`, `clk-last`, `clk-git`, `clk-run`, `clk-done`). | +| `ACTION:` block protocol for write / edit / append / delete / run | Pi's built-in `read` / `write` / `edit` / `bash` tools. | +| YAML workflows in `.clk/config/workflows/` driven by a workflow runner | The chief decides workflow on the fly using the orchestration tools. | +| Per-agent prompt files in `.clk/prompts/` | One operator's manual in `src/prompts.ts`; per-role personas live in `roster.json`. | +| Subprocess-piped provider adapters | tmux pi sessions (`clk_subagent` and the consensus fan-out spawn the same way). 
| +| Robustness loops gated by `clk.config.json::robustness.*` | The four orchestration tools (`clk_consensus`, `clk_subagent_quality`, `clk_autoresearch`, `clk_ralph`) implement the equivalent loops directly; their parameters (`samples`, `maxRetries`, `iterations`) act as per-call knobs. | +| `clk_harness/orchestration/response_quality.py` | Same rules, ported to `src/quality.ts`. | +| Telegram bot integration | Out of scope — use the Python harness for that. | +| REST API | Out of scope — use the Python harness for that. | ## Customising orchestration -All policy lives in [`src/prompts.ts`](src/prompts.ts). Edit that file and -`/reload` to change behavior. Useful knobs: +Most policy still lives in [`src/prompts.ts`](src/prompts.ts) (when to +fan out, when to autoresearch, when to start Ralph, when to call +`clk_done`). Edit that file and `/reload` to change behavior. + +Per-call parameters tune the in-code loops directly: -- Add or remove standing rules. -- Change consensus sample counts (default: 3–5). -- Change Ralph soft cap (default: ~10 iterations per stretch). -- Change completion criteria. -- Change how the chief prefixes dynamic personas onto `delegate` tasks. +- `clk_consensus({ samples: 5 })` — 5 parallel samples (1..6). +- `clk_consensus({ minChars: 80 })` — stricter empty-flag threshold. +- `clk_subagent_quality({ maxRetries: 2 })` — up to 3 total dispatches. +- `clk_autoresearch({ iterations: 4 })` — 4 researcher+critic cycles. +- `clk_ralph({ samples: 5 })` — 5-way consensus per Ralph iteration. -The `clk_*` tools are intentionally minimal mechanics. Resist the urge to -encode policy in them — Pi extensions get the most leverage when the LLM -makes the decisions and the extension just provides primitives + persistence. +The quality detector itself is configurable through +`scoreResponse(text, opts)` from [`src/quality.ts`](src/quality.ts) — +the same `ScoreOpts` shape is forwarded by every quality-gated tool. 
## Error handling and resilience -The extension is designed to survive transient provider problems without ending -the run. Errors are classified into four categories, each with a defined -recovery path: +The extension is designed to survive transient provider problems +without ending the run. Errors are classified into categories, each with +a defined recovery path: | Category | Symptoms | Recovery | |----------|----------|----------| -| **Rate limit** | HTTP 429, "too many requests", "quota exceeded" | Exponential backoff, retried indefinitely (delay capped at 5 minutes) until the run is aborted. The chief is also instructed to try a smaller / different model if the limit persists. | +| **Rate limit** | HTTP 429, "too many requests", "quota exceeded" | Exponential backoff in `withRetry`, retried indefinitely (delay capped at 5 minutes) until the user aborts. The chief is also instructed to try a smaller / different model if the limit persists. | | **Model unavailable** | HTTP 404, "model not found", "not available on free tier" | No retry — the chief falls back to a built-in Pi agent (`worker`, `researcher`, `scout`, `oracle`) or omits `preferredModel` and lets Pi choose. | | **Privacy redaction** | `[REDACTED]` values, "privacy filter", "sensitive content blocked" | Tool params are checked for redaction markers before use; the tool returns a recovery hint asking the chief to retry without the sensitive field (or to write it to a file and pass the path). | -| **Max turns exhausted** | "max turns reached", "turn limit", "turn cap", "no more turns" | The chief re-dispatches the identical `clk_subagent` call immediately without asking for confirmation. If the task exhausts turns twice in a row the chief splits it into two narrower sequential subtasks. | +| **Max turns exhausted** | "max turns reached", "turn limit", "turn cap", "no more turns" | The chief re-dispatches the identical `clk_subagent` / `clk_subagent_quality` / `clk_consensus` call immediately.
If the task exhausts turns twice in a row the chief splits it into two narrower sequential subtasks. | | **Network / transient** | ECONNRESET, ETIMEDOUT, "socket hang up" | Same backoff-and-retry as rate limits. | +| **Quality-flagged output** | empty / malformed / contract-missing / low-confidence / NEEDS_REVIEW=true | `clk_subagent_quality` re-dispatches with a repair preamble up to `maxRetries`; `clk_consensus` picks the highest-scoring sample even if all are sub-threshold so the chief can see the spread and decide. Refusals are non-recoverable — surfaced to the chief instead of retried. | ### Where this is enforced -- **`src/errors.ts`** — `classifyError` (now includes `max_turns`), `isRetryable`, - `looksRedacted`, `isMaxTurnsResult`, `withRetry` (exponential backoff helper), - and `recoveryHint` (human-readable guidance returned to the chief as tool output). -- **`src/index.ts`** — `pi.sendUserMessage` (the call that hands off to the - chief) is wrapped with `withRetry`; abort-caused errors are distinguished - from real errors so the run lifecycle is handled correctly. -- **`src/tools.ts`** — every `clk_*` tool `execute` function checks input - parameters for redaction before acting and returns a descriptive error result - (rather than throwing) when git operations fail, so the chief can decide how - to proceed. -- **`src/prompts.ts`** — rule 8 (max-turns: re-dispatch immediately or split - the task) and rule 10 (other provider errors) in the chief's operator's manual - instruct it how to handle error results from `clk_subagent` calls (which happen - inside Pi's runtime and cannot be intercepted in TypeScript). +- **`src/errors.ts`** — `classifyError`, `isRetryable`, `looksRedacted`, + `isMaxTurnsResult`, `withRetry` (exponential backoff), `recoveryHint`. +- **`src/quality.ts`** — `scoreResponse`, `repairHint`, `isRecoverable`, + `summarise`. 
+- **`src/consensus.ts`** — `dispatchWithQuality` (single + retry), + `runConsensus` (parallel fan-out + winner picking). +- **`src/index.ts`** — `pi.sendUserMessage` (the call that hands off to + the chief) is wrapped with `withRetry`; abort-caused errors are + distinguished from real errors so the run lifecycle is handled + correctly. +- **`src/tools.ts`** — every `clk_*` tool `execute` function checks + input parameters for redaction before acting and returns a + descriptive error result (rather than throwing) when git operations + fail, so the chief can decide how to proceed. +- **`src/prompts.ts`** — the chief's operator's manual still instructs it + how to react to error results from `clk_subagent` calls (Pi runtime + errors that cannot be intercepted in TypeScript). ### Design principle -A single failed subagent call or tool invocation must never end the run. The -extension recovers what it can in TypeScript, then surfaces a recovery hint to -the chief so it can adapt its plan. Use `/clk-abort` when you intentionally -want to stop. +A single failed subagent call or tool invocation must never end the +run. The extension recovers what it can in TypeScript, then surfaces a +recovery hint to the chief so it can adapt its plan. Use `/clk-abort` +when you intentionally want to stop. ## Limitations / gotchas -- **Subagent depth is capped at one level.** Each spawned tmux pi session - receives a preamble instructing it not to spawn further subagents. The - chief (parent) may create grandchildren on a child's behalf — that is the - maximum nesting depth. No env var controls this; it is enforced by the - task preamble the `clk_subagent` tool prepends. -- **Children should not use CLK tools.** Spawned tmux pi sessions may have - CLK loaded if you have it configured globally. The task preamble prepended - by `clk_subagent` explicitly instructs each child not to spawn further - subagents and not to call `clk_*` tools.
This is prompt-level enforcement, - not a technical lock. The chief is the intended sole orchestrator — don't - try to delegate orchestration. -- **Concurrency lock.** Only one `/clk` run can be active per Pi session. - Use `/clk-abort` first if you want to start over with a different idea. +- **Subagent depth is capped at one level.** Each spawned tmux pi + session receives a preamble instructing it not to spawn further + subagents and not to call `clk_*` tools. The chief (parent) may + create grandchildren on a child's behalf — that is the maximum + nesting depth. This is prompt-level enforcement, not a technical + lock. +- **Concurrency lock.** Only one `/clk` run can be active per Pi + session. Use `/clk-abort` first if you want to start over with a + different idea. +- **Subagent timeout.** Each spawned tmux pi session has a 30-minute + hard cap (`SUBAGENT_TIMEOUT_MS` in `src/subagent.ts`). Long-running + experiments should be split into multiple bounded dispatches. +- **Output cap.** Subagent output is truncated at 80,000 characters + before being returned to the chief; the first 2,000 characters of + the full output are kept in `.clk/logs/<name>.log` for + post-mortem. +- **No web TUI.** Pi runs in your terminal; this extension inherits + that. The agent dashboard from the Python CLK is replaced by + status-line entries. - **`ctx.signal` is undefined when `/clk` fires** (the extension is invoked while Pi is idle), so the extension manages its own - `AbortController` and merges it with per-tool signals. Esc + `/clk-abort` - + session shutdown all wire through correctly. -- **No web TUI.** Pi runs in your terminal; this extension inherits that. - The agent dashboard from the Python CLK is replaced by status-line - entries (`clk-roster`, `clk-head`, `clk-last`, `clk-run`, `clk-done`). + `AbortController` and merges it with per-tool signals. Esc + + `/clk-abort` + session shutdown all wire through correctly.
## Repository layout @@ -297,20 +403,58 @@ pi-extension/ package.json # devDeps for editor type-checking; pi loads via jiti tsconfig.json src/ - index.ts # entry: factory, /clk + /clk-abort, session_start replay + index.ts # entry: factory, /clk + /clk-abort + /clk-help + + # /clk-doctor + /clk-undo, session_start replay prompts.ts # the chief's operator's manual (the policy) - tools.ts # clk_cast, clk_progress, clk_checkpoint, - # clk_branch, clk_revert, clk_merge, clk_done - subagent.ts # subagent tool: spawns tmux pi sessions, polls for - # completion, handles cancellation + progress updates + tools.ts # every clk_* tool — clk_cast, clk_progress, + # clk_checkpoint, clk_branch, clk_revert, + # clk_merge, clk_consensus, clk_subagent_quality, + # clk_autoresearch, clk_ralph, clk_done + subagent.ts # clk_subagent + spawnSubagent (tmux pi spawner) + consensus.ts # dispatchWithQuality + runConsensus (parallel + # sample fan-out + quality re-dispatch loop) + quality.ts # scoreResponse + repairHint (port of + # clk_harness/orchestration/response_quality.py) + git.ts # init, checkpoint, branch, merge, revert, + # safety-net installer, hasRemote, commitsAhead, + # pushBestEffort (port of git_ops.py auto-push) state.ts # .clk/state/* persistence + pi.appendEntry mirroring - # (tracks idea, roster, progress, homeBranch) - git.ts # checkpoint, revertTo, head, abortMerge helpers + # (idea, roster, progress, homeBranch) abort.ts # run-scoped AbortController + /clk-abort + shutdown bridge errors.ts # error classification, backoff retry, redaction detection - types.ts # shared types + types.ts # shared types (Roster, ProgressKind, ClkState) + tests/ + errors.test.ts # classifyError / withRetry / recoveryHint + prompts.test.ts # chief primer includes every clk_* tool name + state.test.ts # atomic writes + .bak rotation + round-trip + git.test.ts # real git binary: init, checkpoint, branch, merge, + # revert, hasRemote, commitsAhead, pushBestEffort + quality.test.ts # every flag 
+ repairHint + isRecoverable + consensus.test.ts # injected spawn: ok / retry / non-recoverable / + # fan-out winner picking / clamping / errors / + # maxParallel concurrency + safety_nets.test.ts # gitignore + pre-push hook idempotence + runtime_smoke.test.ts # real pi binary, when available (CI gates this) + index.test.ts # extension factory wires every tool + command, + # firstLineShort handles multi-line ideas ``` +## Testing + +Inside `pi-extension/`: + +```bash +npm ci # install dependencies +npm run typecheck # tsc --noEmit +npm test # 96 tests, ~2s, no network or pi required +npm run test:strict # typecheck + tests in one go +``` + +The full suite runs entirely offline (consensus tests inject a fake +spawn function) so it's safe to run in CI without tmux or pi +installed. The `runtime_smoke.test.ts` self-skips when the real `pi` +binary isn't reachable. + ## License MIT. See the parent repo for the full notice.